Total coverage: 145831 (8%)of 1851791
12 12 12 12 14 1 1 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 /* * net/tipc/diag.c: TIPC socket diag * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "socket.h" #include <linux/sock_diag.h> #include <linux/tipc_sockets_diag.h> static u64 __tipc_diag_gen_cookie(struct sock *sk) { u32 res[2]; sock_diag_save_cookie(sk, res); return *((u64 *)res); } static int __tipc_add_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk) { struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh); struct nlmsghdr *nlh; int err; nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0, NLM_F_MULTI); if (!nlh) return -EMSGSIZE; err = tipc_sk_fill_sock_diag(skb, cb, tsk, req->tidiag_states, __tipc_diag_gen_cookie); if (err) return err; nlmsg_end(skb, nlh); return 0; } static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag); } static int tipc_sock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct tipc_sock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = tipc_dump_start, .dump = tipc_diag_dump, .done = tipc_dump_done, }; netlink_dump_start(net->diag_nlsk, skb, h, &c); return 0; } return -EOPNOTSUPP; } static const struct sock_diag_handler tipc_sock_diag_handler = { .owner = THIS_MODULE, .family = AF_TIPC, .dump = tipc_sock_diag_handler_dump, }; static int __init tipc_diag_init(void) { return sock_diag_register(&tipc_sock_diag_handler); } static void __exit tipc_diag_exit(void) { sock_diag_unregister(&tipc_sock_diag_handler); } module_init(tipc_diag_init); module_exit(tipc_diag_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("TIPC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC);
2320 2319 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> #include <linux/pidfs.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif /* * For things release_task() would like to do *after* tasklist_lock is released. */ struct release_task_post { struct pid *pids[PIDTYPE_MAX]; }; static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(post->pids, p, PIDTYPE_PID); if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); if (group_dead) tty_kref_put(tty); } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void __weak release_thread(struct task_struct *dead_task) { } void release_task(struct task_struct *p) { struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); pidfs_exit(p); cgroup_release(p); thread_pid = get_pid(p->thread_pid); write_lock_irq(&tasklist_lock); ptrace_release_task(p); __exit_signal(&post, p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* for pidfs_exit() and do_notify_parent() */ if (leader->signal->flags & SIGNAL_GROUP_EXIT) leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); proc_flush_pid(thread_pid); put_pid(thread_pid); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); release_thread(p); /* * This task was already removed from the process/thread/pid lists * and lock_task_sighand(p) can't succeed. Nobody else can touch * ->pending or, if group dead, signal->shared_pending. We can call * flush_sigqueue() lockless. */ flush_sigqueue(&p->pending); if (thread_group_leader(p)) flush_sigqueue(&p->signal->shared_pending); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk) { struct core_state *core_state; /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. */ spin_lock_irq(&tsk->sighand->siglock); tsk->flags |= PF_POSTCOREDUMP; core_state = tsk->signal->core_state; spin_unlock_irq(&tsk->sighand->siglock); if (core_state) { struct core_thread self; self.task = current; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } } #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; task_lock(tsk); if (likely(tsk->mm == mm)) { /* tsk can't pass exit_mm/exec_mmap and exit */ read_unlock(&tasklist_lock); WRITE_ONCE(mm->owner, tsk); lru_gen_migrate_mm(mm); ret = true; } task_unlock(tsk); return ret; } static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) { struct task_struct *t; for_each_thread(g, t) { struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm == mm) { if (__try_to_set_owner(t, mm)) return true; } else if (t_mm) break; } return false; } /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(g, &p->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search in the siblings */ list_for_each_entry(g, &p->real_parent->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (atomic_read(&mm->mm_users) <= 1) break; if (g->flags & PF_KTHREAD) continue; if (try_to_set_owner(g, mm)) goto ret; } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); ret: return; } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; exit_mm_release(current, mm); if (!mm) return; mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; /* untraced sub-thread */ do_notify_pidfd(tsk); } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ # ifdef CONFIG_STACK_GROWSUP n--; # else n++; # endif } while (!*n); # ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; # else return (unsigned long)n - (unsigned long)end_of_stack(p); # endif } /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) { #ifdef CONFIG_VM_EVENT_COUNTERS if (used_stack <= 1024) count_vm_event(KSTACK_1K); #if THREAD_SIZE > 1024 else if (used_stack <= 2048) count_vm_event(KSTACK_2K); #endif #if THREAD_SIZE > 2048 else if (used_stack <= 4096) count_vm_event(KSTACK_4K); #endif #if THREAD_SIZE > 4096 else if (used_stack <= 8192) count_vm_event(KSTACK_8K); #endif #if THREAD_SIZE > 8192 else if (used_stack <= 16384) count_vm_event(KSTACK_16K); #endif #if THREAD_SIZE > 16384 else if (used_stack <= 32768) count_vm_event(KSTACK_32K); #endif #if THREAD_SIZE > 32768 else if (used_stack <= 65536) count_vm_event(KSTACK_64K); #endif #if THREAD_SIZE > 65536 else count_vm_event(KSTACK_REST); #endif #endif /* CONFIG_VM_EVENT_COUNTERS */ } static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } spin_unlock_irq(&sighand->siglock); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; WARN_ON(irqs_disabled()); synchronize_group_exit(tsk, code); WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); exit_mm(); if (group_dead) acct_process(); trace_sched_process_exit(tsk); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. * * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } long __do_wait(struct wait_opts *wo) { long retval; /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) return retval; retval = ptrace_do_wait(wo, tsk); if (retval) return retval; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) return -ERESTARTSYS; return retval; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); do { set_current_state(TASK_INTERRUPTIBLE); retval = __do_wait(wo); if (retval != -ERESTARTSYS) break; if (signal_pending(current)) break; schedule(); } while (1); __set_current_state(TASK_RUNNING); remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); return retval; } int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo->wo_type = type; wo->wo_pid = pid; wo->wo_flags = options; wo->wo_info = infop; wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) wo->wo_flags |= WNOHANG; return 0; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; long ret; ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); if (ret) return ret; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by * -falign-functions=N. * * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 */ __weak __function_aligned void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort);
41 40 35 42 54 54 4 5 9 9 3 7 7 7 7 2 36 36 38 38 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 /* * Cryptographic API. * * Glue code for the SHA512 Secure Hash Algorithm assembler * implementation using supplemental SSE3 / AVX / AVX2 instructions. * * This file is based on sha512_generic.c * * Copyright (C) 2013 Intel Corporation * Author: Tim Chen <tim.c.chen@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/internal/hash.h> #include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <asm/cpu_device_id.h> #include <asm/simd.h> asmlinkage void sha512_transform_ssse3(struct sha512_state *state, const u8 *data, int blocks); static int sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha512_block_fn *sha512_xform) { struct sha512_state *sctx = shash_desc_ctx(desc); if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); /* * Make sure struct sha512_state begins directly with the SHA512 * 512-bit internal state, as this is what the asm functions expect. */ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); kernel_fpu_begin(); sha512_base_do_update(desc, data, len, sha512_xform); kernel_fpu_end(); return 0; } static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_block_fn *sha512_xform) { if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); if (len) sha512_base_do_update(desc, data, len, sha512_xform); sha512_base_do_finalize(desc, sha512_xform); kernel_fpu_end(); return sha512_base_finish(desc, out); } static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_ssse3); } static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_ssse3); } /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { return sha512_ssse3_finup(desc, NULL, 0, out); } static struct shash_alg sha512_ssse3_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", .cra_priority = 150, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", .cra_priority = 150, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int register_sha512_ssse3(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) return crypto_register_shashes(sha512_ssse3_algs, ARRAY_SIZE(sha512_ssse3_algs)); return 0; } static void unregister_sha512_ssse3(void) { if (boot_cpu_has(X86_FEATURE_SSSE3)) crypto_unregister_shashes(sha512_ssse3_algs, ARRAY_SIZE(sha512_ssse3_algs)); } asmlinkage void sha512_transform_avx(struct sha512_state *state, const u8 *data, int blocks); static bool avx_usable(void) { if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { if (boot_cpu_has(X86_FEATURE_AVX)) pr_info("AVX detected but unusable.\n"); return false; } return true; } static int sha512_avx_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_avx); } static int sha512_avx_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_avx); } /* Add padding and return the message digest. */ static int sha512_avx_final(struct shash_desc *desc, u8 *out) { return sha512_avx_finup(desc, NULL, 0, out); } static struct shash_alg sha512_avx_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx_update, .final = sha512_avx_final, .finup = sha512_avx_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx", .cra_priority = 160, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx_update, .final = sha512_avx_final, .finup = sha512_avx_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx", .cra_priority = 160, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int register_sha512_avx(void) { if (avx_usable()) return crypto_register_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); return 0; } static void unregister_sha512_avx(void) { if (avx_usable()) crypto_unregister_shashes(sha512_avx_algs, ARRAY_SIZE(sha512_avx_algs)); } asmlinkage void sha512_transform_rorx(struct sha512_state *state, const u8 *data, int blocks); static int sha512_avx2_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_update(desc, data, len, sha512_transform_rorx); } static int sha512_avx2_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return sha512_finup(desc, data, len, out, sha512_transform_rorx); } /* Add padding and return the message digest. */ static int sha512_avx2_final(struct shash_desc *desc, u8 *out) { return sha512_avx2_finup(desc, NULL, 0, out); } static struct shash_alg sha512_avx2_algs[] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = sha512_avx2_update, .final = sha512_avx2_final, .finup = sha512_avx2_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-avx2", .cra_priority = 170, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = sha512_avx2_update, .final = sha512_avx2_final, .finup = sha512_avx2_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-avx2", .cra_priority = 170, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static bool avx2_usable(void) { if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI2)) return true; return false; } static int register_sha512_avx2(void) { if (avx2_usable()) return crypto_register_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); return 0; } static const struct x86_cpu_id module_cpu_ids[] = { X86_MATCH_FEATURE(X86_FEATURE_AVX2, NULL), X86_MATCH_FEATURE(X86_FEATURE_AVX, NULL), X86_MATCH_FEATURE(X86_FEATURE_SSSE3, NULL), {} }; MODULE_DEVICE_TABLE(x86cpu, module_cpu_ids); static void unregister_sha512_avx2(void) { if (avx2_usable()) crypto_unregister_shashes(sha512_avx2_algs, ARRAY_SIZE(sha512_avx2_algs)); } static int __init sha512_ssse3_mod_init(void) { if (!x86_match_cpu(module_cpu_ids)) return -ENODEV; if (register_sha512_ssse3()) goto fail; if (register_sha512_avx()) { unregister_sha512_ssse3(); goto fail; } if (register_sha512_avx2()) { unregister_sha512_avx(); unregister_sha512_ssse3(); goto fail; } return 0; fail: return -ENODEV; } static void __exit sha512_ssse3_mod_fini(void) { unregister_sha512_avx2(); unregister_sha512_avx(); unregister_sha512_ssse3(); } module_init(sha512_ssse3_mod_init); module_exit(sha512_ssse3_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, Supplemental SSE3 accelerated"); MODULE_ALIAS_CRYPTO("sha512"); MODULE_ALIAS_CRYPTO("sha512-ssse3"); MODULE_ALIAS_CRYPTO("sha512-avx"); MODULE_ALIAS_CRYPTO("sha512-avx2"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha384-ssse3"); MODULE_ALIAS_CRYPTO("sha384-avx"); MODULE_ALIAS_CRYPTO("sha384-avx2");
3 15 17 1 1 15 17 17 2 15 7 15 15 15 7 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/netfilter/xt_IDLETIMER.c * * Netfilter module to trigger a timer when packet matches. * After timer expires a kevent will be sent. * * Copyright (C) 2004, 2010 Nokia Corporation * Written by Timo Teras <ext-timo.teras@nokia.com> * * Converted to x_tables and reworked for upstream inclusion * by Luciano Coelho <luciano.coelho@nokia.com> * * Contact: Luciano Coelho <luciano.coelho@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/timer.h> #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_IDLETIMER.h> #include <linux/kdev_t.h> #include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/sysfs.h> struct idletimer_tg { struct list_head entry; struct alarm alarm; struct timer_list timer; struct work_struct work; struct kobject *kobj; struct device_attribute attr; unsigned int refcnt; u8 timer_type; }; static LIST_HEAD(idletimer_tg_list); static DEFINE_MUTEX(list_mutex); static struct kobject *idletimer_tg_kobj; static struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { struct idletimer_tg *entry; list_for_each_entry(entry, &idletimer_tg_list, entry) { if (!strcmp(label, entry->attr.attr.name)) return entry; } return NULL; } static ssize_t idletimer_tg_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idletimer_tg *timer; unsigned long expires = 0; struct timespec64 ktimespec = {}; long time_diff = 0; mutex_lock(&list_mutex); timer = __idletimer_tg_find_by_label(attr->attr.name); if (timer) { if (timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm); ktimespec = ktime_to_timespec64(expires_alarm); time_diff = ktimespec.tv_sec; } else { expires = timer->timer.expires; time_diff = jiffies_to_msecs(expires - jiffies) / 1000; } } mutex_unlock(&list_mutex); if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) return sysfs_emit(buf, "%ld\n", time_diff); return sysfs_emit(buf, "0\n"); } static void idletimer_tg_work(struct work_struct *work) { struct idletimer_tg *timer = container_of(work, struct idletimer_tg, work); sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); } static void idletimer_tg_expired(struct timer_list *t) { struct idletimer_tg *timer = from_timer(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static int idletimer_check_sysfs_name(const char *name, unsigned int size) { int ret; ret = xt_check_proc_name(name, size); if (ret < 0) return ret; if (!strcmp(name, "power") || !strcmp(name, "subsystem") || !strcmp(name, "uevent")) return -EINVAL; return 0; } static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } list_add(&info->timer->entry, &idletimer_tg_list); timer_setup(&info->timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) { int ret; info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } /* notify userspace */ kobject_uevent(idletimer_tg_kobj,KOBJ_ADD); list_add(&info->timer->entry, &idletimer_tg_list); pr_debug("timer type value is %u", info->timer_type); info->timer->timer_type = info->timer_type; info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout; alarm_init(&info->timer->alarm, ALARM_BOOTTIME, idletimer_tg_alarmproc); info->timer->alarm.data = info->timer; tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); } return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); return XT_CONTINUE; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); } return XT_CONTINUE; } static int idletimer_tg_helper(struct idletimer_tg_info *info) { if (info->timeout == 0) { pr_debug("timeout value is zero\n"); return -EINVAL; } if (info->timeout >= INT_MAX / 1000) { pr_debug("timeout value is too big\n"); return -EINVAL; } if (info->label[0] == '\0' || strnlen(info->label, MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { pr_debug("label is empty or not nul-terminated\n"); return -EINVAL; } return 0; } static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) { struct idletimer_tg_info *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); ret = idletimer_tg_helper(info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) { struct idletimer_tg_info_v1 *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); if (info->send_nl_msg) return -EOPNOTSUPP; ret = idletimer_tg_helper((struct idletimer_tg_info *)info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } if (info->timer_type > XT_IDLETIMER_ALARM) { pr_debug("invalid value for timer type\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { if (info->timer->timer_type != info->timer_type) { pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); mutex_unlock(&list_mutex); return -EINVAL; } info->timer->refcnt++; if (info->timer_type & XT_IDLETIMER_ALARM) { /* calculate remaining expiry time */ ktime_t tout = alarm_expires_remaining(&info->timer->alarm); struct timespec64 ktimespec = ktime_to_timespec64(tout); if (ktimespec.tv_sec > 0) { pr_debug("time_expiry_remaining %lld\n", ktimespec.tv_sec); alarm_start_relative(&info->timer->alarm, tout); } } else { mod_timer(&info->timer->timer, msecs_to_jiffies(info->timeout * 1000) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create_v1(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); timer_shutdown_sync(&info->timer->timer); cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { alarm_cancel(&info->timer->alarm); } else { timer_shutdown_sync(&info->timer->timer); } cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { .name = "IDLETIMER", .family = NFPROTO_IPV4, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV4, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "IDLETIMER", .family = NFPROTO_IPV6, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV6, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #endif }; static struct class *idletimer_tg_class; static struct device *idletimer_tg_device; static int __init idletimer_tg_init(void) { int err; idletimer_tg_class = class_create("xt_idletimer"); err = PTR_ERR(idletimer_tg_class); if (IS_ERR(idletimer_tg_class)) { pr_debug("couldn't register device class\n"); goto out; } idletimer_tg_device = device_create(idletimer_tg_class, NULL, MKDEV(0, 0), NULL, "timers"); err = PTR_ERR(idletimer_tg_device); if (IS_ERR(idletimer_tg_device)) { pr_debug("couldn't register system device\n"); goto out_class; } idletimer_tg_kobj = &idletimer_tg_device->kobj; err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; } return 0; out_dev: device_destroy(idletimer_tg_class, MKDEV(0, 0)); out_class: class_destroy(idletimer_tg_class); out: return err; } static void __exit idletimer_tg_exit(void) { xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); device_destroy(idletimer_tg_class, MKDEV(0, 0)); class_destroy(idletimer_tg_class); } module_init(idletimer_tg_init); module_exit(idletimer_tg_exit); MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("ipt_IDLETIMER"); MODULE_ALIAS("ip6t_IDLETIMER");
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PATH_H #define _LINUX_PATH_H struct dentry; struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; } __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } /* * Cleanup macro for use with __free(path_put). Avoids dereference and * copying @path unlike DEFINE_FREE(). path_put() will handle the empty * path correctly just ensure @path is initialized: * * struct path path __free(path_put) = {}; */ #define __free_path_put path_put #endif /* _LINUX_PATH_H */
3 12 12 10 2 25 4 7 2 5 6 2 4 3 1 2 12 2 10 4 2 2 29 29 5 5 29 37 2 1 36 6 2 33 29 1 29 29 23 6 24 29 2 4 24 24 17 6 6 6 2 1 1 4 1 3 9 3 1 2 3 2 1 2 3 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 /* * Copyright (c) 2007, 2020 Oracle and/or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ #include "rds.h" /* * XXX * - build with sparse * - should we detect duplicate keys on a socket? hmm. * - an rdma is an mlock, apply rlimit? */ /* * get the number of pages by looking at the page indices that the start and * end addresses fall in. * * Returns 0 if the vec is invalid. It is invalid if the number of bytes * causes the address to wrap or overflows an unsigned int. This comes * from being stored in the 'length' member of 'struct scatterlist'. */ static unsigned int rds_pages_in_vec(struct rds_iovec *vec) { if ((vec->addr + vec->bytes <= vec->addr) || (vec->bytes > (u64)UINT_MAX)) return 0; return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - (vec->addr >> PAGE_SHIFT); } static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, struct rds_mr *insert) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct rds_mr *mr; while (*p) { parent = *p; mr = rb_entry(parent, struct rds_mr, r_rb_node); if (key < mr->r_key) p = &(*p)->rb_left; else if (key > mr->r_key) p = &(*p)->rb_right; else return mr; } if (insert) { rb_link_node(&insert->r_rb_node, parent, p); rb_insert_color(&insert->r_rb_node, root); kref_get(&insert->r_kref); } return NULL; } /* * Destroy the transport-specific part of a MR. */ static void rds_destroy_mr(struct rds_mr *mr) { struct rds_sock *rs = mr->r_sock; void *trans_private = NULL; unsigned long flags; rdsdebug("RDS: destroy mr key is %x refcnt %u\n", mr->r_key, kref_read(&mr->r_kref)); spin_lock_irqsave(&rs->rs_rdma_lock, flags); if (!RB_EMPTY_NODE(&mr->r_rb_node)) rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); trans_private = mr->r_trans_private; mr->r_trans_private = NULL; spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (trans_private) mr->r_trans->free_mr(trans_private, mr->r_invalidate); } void __rds_put_mr_final(struct kref *kref) { struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref); rds_destroy_mr(mr); kfree(mr); } /* * By the time this is called we can't have any more ioctls called on * the socket so we don't need to worry about racing with others. */ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; unsigned long flags; /* Release any MRs associated with this socket */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = rb_entry(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); kref_put(&mr->r_kref, __rds_put_mr_final); spin_lock_irqsave(&rs->rs_rdma_lock, flags); } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); } /* * Helper function to pin user pages. */ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { unsigned int gup_flags = FOLL_LONGTERM; int ret; if (write) gup_flags |= FOLL_WRITE; ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { unpin_user_pages(pages, ret); ret = -EFAULT; } return ret; } static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, u64 *cookie_ret, struct rds_mr **mr_ret, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; unsigned int nents = 0; int need_odp = 0; long i; int ret; if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } /* If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. */ if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || PAGE_ALIGN(args->vec.addr + args->vec.bytes) < (args->vec.addr + args->vec.bytes)) { ret = -EINVAL; goto out; } if (!can_do_mlock()) { ret = -EPERM; goto out; } nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; goto out; } /* Restrict the size of mr irrespective of underlying transport * To account for unaligned mr regions, subtract one from nr_pages */ if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { ret = -EMSGSIZE; goto out; } rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); /* XXX clamp nr_pages to limit the size of this alloc? */ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); if (!mr) { ret = -ENOMEM; goto out; } kref_init(&mr->r_kref); RB_CLEAR_NODE(&mr->r_rb_node); mr->r_trans = rs->rs_transport; mr->r_sock = rs; if (args->flags & RDS_RDMA_USE_ONCE) mr->r_use_once = 1; if (args->flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; if (args->flags & RDS_RDMA_READWRITE) mr->r_write = 1; /* * Pin the pages that make up the user buffer and transfer the page * pointers to the mr's sg array. We check to see if we've mapped * the whole region after transferring the partial page references * to the sg array so that we can have one page ref cleanup path. * * For now we have no flag that tells us whether the mapping is * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret == -EOPNOTSUPP) { need_odp = 1; } else if (ret <= 0) { goto out; } else { nents = ret; sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); if (!sg) { ret = -ENOMEM; goto out; } WARN_ON(!nents); sg_init_table(sg, nents); /* Stick all pages into the scatterlist */ for (i = 0 ; i < nents; i++) sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); rdsdebug("RDS: trans_private nents is %u\n", nents); } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { /* In ODP case, we don't GUP pages, so don't need * to release anything. */ if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } mr->r_trans_private = trans_private; rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", mr->r_key, (void *)(unsigned long) args->cookie_addr); /* The user may pass us an unaligned address, but we can only * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing <R_Key, offset> and pass that * around. */ if (need_odp) cookie = rds_rdma_make_cookie(mr->r_key, 0); else cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = -EFAULT; goto out; } /* Inserting the new MR into the rbtree bumps its * reference count. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); BUG_ON(found && found != mr); rdsdebug("RDS: get_mr key is %x\n", mr->r_key); if (mr_ret) { kref_get(&mr->r_kref); *mr_ret = mr; } ret = 0; out: kfree(pages); if (mr) kref_put(&mr->r_kref, __rds_put_mr_final); return ret; } int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; /* * Initially, just behave like get_mr(). * TODO: Implement get_mr as wrapper around this * and deprecate it. */ new_args.vec = args.vec; new_args.cookie_addr = args.cookie_addr; new_args.flags = args.flags; return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL); } /* * Free the MR indicated by the given R_Key */ int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; unsigned long flags; if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ if (args.cookie == 0) { if (!rs->rs_transport || !rs->rs_transport->flush_mrs) return -EINVAL; rs->rs_transport->flush_mrs(); return 0; } /* Look up the MR given its R_key and remove it from the rbtree * so nobody else finds it. * This should also prevent races with rds_rdma_unuse. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); if (mr) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); if (args.flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (!mr) return -EINVAL; kref_put(&mr->r_kref, __rds_put_mr_final); return 0; } /* * This is called when we receive an extension header that * tells us this MR was used. It allows us to implement * use_once semantics */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) { struct rds_mr *mr; unsigned long flags; int zot_me = 0; spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } /* Get a reference so that the MR won't go away before calling * sync_mr() below. */ kref_get(&mr->r_kref); /* If it is going to be freed, remove it from the tree now so * that no other thread can find it and free it. */ if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); /* Release the reference held above. */ kref_put(&mr->r_kref, __rds_put_mr_final); /* If the MR was marked as invalidate, this will * trigger an async flush. */ if (zot_me) kref_put(&mr->r_kref, __rds_put_mr_final); } void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; if (ro->op_odp_mr) { kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final); } else { for (i = 0; i < ro->op_nents; i++) { struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write); } } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) { struct page *page = sg_page(ao->op_sg); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, true); kfree(ao->op_notifier); ao->op_notifier = NULL; ao->op_active = 0; } /* * Count the number of pages needed to describe an incoming iovec array. */ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) { int tot_pages = 0; unsigned int nr_pages; unsigned int i; /* figure out the number of pages in the vector */ for (i = 0; i < nr_iovecs; i++) { nr_pages = rds_pages_in_vec(&iov[i]); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages; } int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov) { struct rds_iovec *vec; struct rds_iovec __user *local_vec; int tot_pages = 0; unsigned int nr_pages; unsigned int i; local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; if (args->nr_local == 0) return -EINVAL; if (args->nr_local > UIO_MAXIOV) return -EMSGSIZE; iov->iov = kcalloc(args->nr_local, sizeof(struct rds_iovec), GFP_KERNEL); if (!iov->iov) return -ENOMEM; vec = &iov->iov[0]; if (copy_from_user(vec, local_vec, args->nr_local * sizeof(struct rds_iovec))) return -EFAULT; iov->len = args->nr_local; /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++, vec++) { nr_pages = rds_pages_in_vec(vec); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages * sizeof(struct scatterlist); } /* * The application asks for a RDMA transfer. * Extract all arguments and set up the rdma_op */ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec) { struct rds_rdma_args *args; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec *iovs; unsigned int i, j; int ret = 0; bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) return -EINVAL; args = CMSG_DATA(cmsg); if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out_ret; } if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out_ret; } if (vec->len != args->nr_local) { ret = -EINVAL; goto out_ret; } /* odp-mr is not supported for multiple requests within one message */ if (args->nr_local != 1) odp_supported = false; iovs = vec->iov; nr_pages = rds_rdma_pages(iovs, args->nr_local); if (nr_pages < 0) { ret = -EINVAL; goto out_ret; } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_ret; } op->op_write = !!(args->flags & RDS_RDMA_READWRITE); op->op_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; op->op_odp_mr = NULL; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); if (IS_ERR(op->op_sg)) { ret = PTR_ERR(op->op_sg); goto out_pages; } if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); if (!op->op_notifier) { ret = -ENOMEM; goto out_pages; } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and * optionally an offset into it. This is how we implement RDMA into * unaligned memory. * When setting up the RDMA, we need to add that offset to the * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ op->op_rkey = rds_rdma_cookie_key(args->cookie); op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, op->op_rkey); for (i = 0; i < args->nr_local; i++) { struct rds_iovec *iov = &iovs[i]; /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ unsigned int nr = rds_pages_in_vec(iov); rs->rs_user_addr = iov->addr; rs->rs_user_bytes = iov->bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if ((!odp_supported && ret <= 0) || (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; if (ret == -EOPNOTSUPP) { struct rds_mr *local_odp_mr; if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out_pages; } local_odp_mr = kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); if (!local_odp_mr) { ret = -ENOMEM; goto out_pages; } RB_CLEAR_NODE(&local_odp_mr->r_rb_node); kref_init(&local_odp_mr->r_kref); local_odp_mr->r_trans = rs->rs_transport; local_odp_mr->r_sock = rs; local_odp_mr->r_trans_private = rs->rs_transport->get_mr( NULL, 0, rs, &local_odp_mr->r_key, NULL, iov->addr, iov->bytes, ODP_VIRTUAL); if (IS_ERR(local_odp_mr->r_trans_private)) { ret = PTR_ERR(local_odp_mr->r_trans_private); rdsdebug("get_mr ret %d %p\"", ret, local_odp_mr->r_trans_private); kfree(local_odp_mr); ret = -EOPNOTSUPP; goto out_pages; } rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", local_odp_mr, local_odp_mr->r_trans_private); op->op_odp_mr = local_odp_mr; op->op_odp_addr = iov->addr; } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { unsigned int offset = iov->addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); iov->addr += sg->length; iov->bytes -= sg->length; } op->op_nents += nr; } if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, (unsigned int) args->remote_vec.bytes); ret = -EINVAL; goto out_pages; } op->op_bytes = nr_bytes; ret = 0; out_pages: kfree(pages); out_ret: if (ret) rds_rdma_free_op(op); else rds_stats_inc(s_send_rdma); return ret; } /* * The application wants us to pass an RDMA destination (aka MR) * to the remote */ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { unsigned long flags; struct rds_mr *mr; u32 r_key; int err = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || rm->m_rdma_cookie != 0) return -EINVAL; memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); /* We are reusing a previously mapped MR here. Most likely, the * application has written to the buffer, so we need to explicitly * flush those writes to RAM. Otherwise the HCA may not see them * when doing a DMA from that buffer. */ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) err = -EINVAL; /* invalid r_key */ else kref_get(&mr->r_kref); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; } /* * The application passes us an address range it wants to enable RDMA * to/from. We map the area, and save the <R_Key,offset> pair * in rm->m_rdma_cookie. This causes it to be sent along to the peer * in an extension header. */ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || rm->m_rdma_cookie != 0) return -EINVAL; return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr, rm->m_conn_path); } /* * Fill in rds_message for an atomic request. */ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct page *page = NULL; struct rds_atomic_args *args; int ret = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) || rm->atomic.op_active) return -EINVAL; args = CMSG_DATA(cmsg); /* Nonmasked & masked cmsg ops converted to masked hw ops */ switch (cmsg->cmsg_type) { case RDS_CMSG_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->fadd.add; rm->atomic.op_m_fadd.nocarry_mask = 0; break; case RDS_CMSG_MASKED_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->m_fadd.add; rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; break; case RDS_CMSG_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->cswp.compare; rm->atomic.op_m_cswp.swap = args->cswp.swap; rm->atomic.op_m_cswp.compare_mask = ~0; rm->atomic.op_m_cswp.swap_mask = ~0; break; case RDS_CMSG_MASKED_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->m_cswp.compare; rm->atomic.op_m_cswp.swap = args->m_cswp.swap; rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; break; default: BUG(); /* should never happen */ } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); if (IS_ERR(rm->atomic.op_sg)) { ret = PTR_ERR(rm->atomic.op_sg); goto err; } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { ret = -EFAULT; goto err; } ret = rds_pin_pages(args->local_addr, 1, &page, 1); if (ret != 1) goto err; ret = 0; sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); if (rm->atomic.op_notify || rm->atomic.op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); if (!rm->atomic.op_notifier) { ret = -ENOMEM; goto err; } rm->atomic.op_notifier->n_user_token = args->user_token; rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; } rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); return ret; err: if (page) unpin_user_page(page); rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; }
10 11 11 11 11 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 // SPDX-License-Identifier: GPL-2.0-or-later /* Key permission checking * * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/security.h> #include "internal.h" /** * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. * @need_perm: The permission required. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. * * The caller must hold either a ref on cred or must hold the RCU readlock. * * Returns 0 if successful, -EACCES if access is denied based on the * permissions bits or the LSM check. */ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; key_perm_t kperm, mask; int ret; switch (need_perm) { default: WARN_ON(1); return -EACCES; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: goto lsm; case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break; case KEY_NEED_READ: mask = KEY_OTH_READ; break; case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break; case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break; case KEY_NEED_LINK: mask = KEY_OTH_LINK; break; case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break; } key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ if (uid_eq(key->uid, cred->fsuid)) { kperm = key->perm >> 16; goto use_these_perms; } /* use the third 8-bits of permissions for keys the caller has a group * membership in common with */ if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) { if (gid_eq(key->gid, cred->fsgid)) { kperm = key->perm >> 8; goto use_these_perms; } ret = groups_search(cred->group_info, key->gid); if (ret) { kperm = key->perm >> 8; goto use_these_perms; } } /* otherwise use the least-significant 8-bits */ kperm = key->perm; use_these_perms: /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions */ if (is_key_possessed(key_ref)) kperm |= key->perm >> 24; if ((kperm & mask) != mask) return -EACCES; /* let LSM be the final arbiter */ lsm: return security_key_permission(key_ref, cred, need_perm); } EXPORT_SYMBOL(key_task_permission); /** * key_validate - Validate a key. * @key: The key to be validated. * * Check that a key is valid, returning 0 if the key is okay, -ENOKEY if the * key is invalidated, -EKEYREVOKED if the key's type has been removed or if * the key has been revoked or -EKEYEXPIRED if the key has expired. */ int key_validate(const struct key *key) { unsigned long flags = READ_ONCE(key->flags); time64_t expiry = READ_ONCE(key->expiry); if (flags & (1 << KEY_FLAG_INVALIDATED)) return -ENOKEY; /* check it's still accessible */ if (flags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))) return -EKEYREVOKED; /* check it hasn't expired */ if (expiry) { if (ktime_get_real_seconds() >= expiry) return -EKEYEXPIRED; } return 0; } EXPORT_SYMBOL(key_validate);
9 3 37 37 37 3 3 3 3 12 12 2 1 1 1 1 1 1 1 3 3 3 1 1 1 3 3 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 /* * net/tipc/monitor.c * * Copyright (c) 2016, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/genetlink.h> #include "core.h" #include "addr.h" #include "monitor.h" #include "bearer.h" #define MAX_MON_DOMAIN 64 #define MON_TIMEOUT 120000 #define MAX_PEER_DOWN_EVENTS 4 /* struct tipc_mon_domain: domain record to be transferred between peers * @len: actual size of domain record * @gen: current generation of sender's domain * @ack_gen: most recent generation of self's domain acked by peer * @member_cnt: number of domain member nodes described in this record * @up_map: bit map indicating which of the members the sender considers up * @members: identity of the domain members */ struct tipc_mon_domain { u16 len; u16 gen; u16 ack_gen; u16 member_cnt; u64 up_map; u32 members[MAX_MON_DOMAIN]; }; /* struct tipc_peer: state of a peer node and its domain * @addr: tipc node identity of peer * @head_map: shows which other nodes currently consider peer 'up' * @domain: most recent domain record from peer * @hash: position in hashed lookup list * @list: position in linked list, in circular ascending order by 'addr' * @applied: number of reported domain members applied on this monitor list * @is_up: peer is up as seen from this node * @is_head: peer is assigned domain head as seen from this node * @is_local: peer is in local domain and should be continuously monitored * @down_cnt: - numbers of other peers which have reported this on lost */ struct tipc_peer { u32 addr; struct tipc_mon_domain *domain; struct hlist_node hash; struct list_head list; u8 applied; u8 down_cnt; bool is_up; bool is_head; bool is_local; }; struct tipc_monitor { struct hlist_head peers[NODE_HTABLE_SIZE]; int peer_cnt; struct tipc_peer *self; rwlock_t lock; struct tipc_mon_domain cache; u16 list_gen; u16 dom_gen; struct net *net; struct timer_list timer; unsigned long timer_intv; }; static struct tipc_monitor *tipc_monitor(struct net *net, int bearer_id) { return tipc_net(net)->monitors[bearer_id]; } const int tipc_max_domain_size = sizeof(struct tipc_mon_domain); static inline u16 mon_cpu_to_le16(u16 val) { return (__force __u16)htons(val); } static inline u32 mon_cpu_to_le32(u32 val) { return (__force __u32)htonl(val); } static inline u64 mon_cpu_to_le64(u64 val) { return (__force __u64)cpu_to_be64(val); } static inline u16 mon_le16_to_cpu(u16 val) { return ntohs((__force __be16)val); } static inline u32 mon_le32_to_cpu(u32 val) { return ntohl((__force __be32)val); } static inline u64 mon_le64_to_cpu(u64 val) { return be64_to_cpu((__force __be64)val); } /* dom_rec_len(): actual length of domain record for transport */ static int dom_rec_len(struct tipc_mon_domain *dom, u16 mcnt) { return (offsetof(struct tipc_mon_domain, members)) + (mcnt * sizeof(u32)); } /* dom_size() : calculate size of own domain based on number of peers */ static int dom_size(int peers) { int i = 0; while ((i * i) < peers) i++; return min(i, MAX_MON_DOMAIN); } static void map_set(u64 *up_map, int i, unsigned int v) { *up_map &= ~(1ULL << i); *up_map |= ((u64)v << i); } static int map_get(u64 up_map, int i) { return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) { return list_last_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_nxt(struct tipc_peer *peer) { return list_first_entry(&peer->list, struct tipc_peer, list); } static struct tipc_peer *peer_head(struct tipc_peer *peer) { while (!peer->is_head) peer = peer_prev(peer); return peer; } static struct tipc_peer *get_peer(struct tipc_monitor *mon, u32 addr) { struct tipc_peer *peer; unsigned int thash = tipc_hashfn(addr); hlist_for_each_entry(peer, &mon->peers[thash], hash) { if (peer->addr == addr) return peer; } return NULL; } static struct tipc_peer *get_self(struct net *net, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); return mon->self; } static inline bool tipc_mon_is_active(struct net *net, struct tipc_monitor *mon) { struct tipc_net *tn = tipc_net(net); return mon->peer_cnt > tn->mon_threshold; } /* mon_identify_lost_members() : - identify amd mark potentially lost members */ static void mon_identify_lost_members(struct tipc_peer *peer, struct tipc_mon_domain *dom_bef, int applied_bef) { struct tipc_peer *member = peer; struct tipc_mon_domain *dom_aft = peer->domain; int applied_aft = peer->applied; int i; for (i = 0; i < applied_bef; i++) { member = peer_nxt(member); /* Do nothing if self or peer already see member as down */ if (!member->is_up || !map_get(dom_bef->up_map, i)) continue; /* Loss of local node must be detected by active probing */ if (member->is_local) continue; /* Start probing if member was removed from applied domain */ if (!applied_aft || (applied_aft < i)) { member->down_cnt = 1; continue; } /* Member loss is confirmed if it is still in applied domain */ if (!map_get(dom_aft->up_map, i)) member->down_cnt++; } } /* mon_apply_domain() : match a peer's domain record against monitor list */ static void mon_apply_domain(struct tipc_monitor *mon, struct tipc_peer *peer) { struct tipc_mon_domain *dom = peer->domain; struct tipc_peer *member; u32 addr; int i; if (!dom || !peer->is_up) return; /* Scan across domain members and match against monitor list */ peer->applied = 0; member = peer_nxt(peer); for (i = 0; i < dom->member_cnt; i++) { addr = dom->members[i]; if (addr != member->addr) return; peer->applied++; member = peer_nxt(member); } } /* mon_update_local_domain() : update after peer addition/removal/up/down */ static void mon_update_local_domain(struct tipc_monitor *mon) { struct tipc_peer *self = mon->self; struct tipc_mon_domain *cache = &mon->cache; struct tipc_mon_domain *dom = self->domain; struct tipc_peer *peer = self; u64 prev_up_map = dom->up_map; u16 member_cnt, i; bool diff; /* Update local domain size based on current size of cluster */ member_cnt = dom_size(mon->peer_cnt) - 1; self->applied = member_cnt; /* Update native and cached outgoing local domain records */ dom->len = dom_rec_len(dom, member_cnt); diff = dom->member_cnt != member_cnt; dom->member_cnt = member_cnt; for (i = 0; i < member_cnt; i++) { peer = peer_nxt(peer); diff |= dom->members[i] != peer->addr; dom->members[i] = peer->addr; map_set(&dom->up_map, i, peer->is_up); cache->members[i] = mon_cpu_to_le32(peer->addr); } diff |= dom->up_map != prev_up_map; if (!diff) return; dom->gen = ++mon->dom_gen; cache->len = mon_cpu_to_le16(dom->len); cache->gen = mon_cpu_to_le16(dom->gen); cache->member_cnt = mon_cpu_to_le16(member_cnt); cache->up_map = mon_cpu_to_le64(dom->up_map); mon_apply_domain(mon, self); } /* mon_update_neighbors() : update preceding neighbors of added/removed peer */ static void mon_update_neighbors(struct tipc_monitor *mon, struct tipc_peer *peer) { int dz, i; dz = dom_size(mon->peer_cnt); for (i = 0; i < dz; i++) { mon_apply_domain(mon, peer); peer = peer_prev(peer); } } /* mon_assign_roles() : reassign peer roles after a network change * The monitor list is consistent at this stage; i.e., each peer is monitoring * a set of domain members as matched between domain record and the monitor list */ static void mon_assign_roles(struct tipc_monitor *mon, struct tipc_peer *head) { struct tipc_peer *peer = peer_nxt(head); struct tipc_peer *self = mon->self; int i = 0; for (; peer != self; peer = peer_nxt(peer)) { peer->is_local = false; /* Update domain member */ if (i++ < head->applied) { peer->is_head = false; if (head == self) peer->is_local = true; continue; } /* Assign next domain head */ if (!peer->is_up) continue; if (peer->is_head) break; head = peer; head->is_head = true; i = 0; } mon->list_gen++; } void tipc_mon_remove_peer(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *prev, *head; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) goto exit; prev = peer_prev(peer); list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); kfree(peer); mon->peer_cnt--; head = peer_head(prev); if (head == self) mon_update_local_domain(mon); mon_update_neighbors(mon, prev); /* Revert to full-mesh monitoring if we reach threshold */ if (!tipc_mon_is_active(net, mon)) { list_for_each_entry(peer, &self->list, list) { kfree(peer->domain); peer->domain = NULL; peer->applied = 0; } } mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } static bool tipc_mon_add_peer(struct tipc_monitor *mon, u32 addr, struct tipc_peer **peer) { struct tipc_peer *self = mon->self; struct tipc_peer *cur, *prev, *p; p = kzalloc(sizeof(*p), GFP_ATOMIC); *peer = p; if (!p) return false; p->addr = addr; /* Add new peer to lookup list */ INIT_LIST_HEAD(&p->list); hlist_add_head(&p->hash, &mon->peers[tipc_hashfn(addr)]); /* Sort new peer into iterator list, in ascending circular order */ prev = self; list_for_each_entry(cur, &self->list, list) { if ((addr > prev->addr) && (addr < cur->addr)) break; if (((addr < cur->addr) || (addr > prev->addr)) && (prev->addr > cur->addr)) break; prev = cur; } list_add_tail(&p->list, &cur->list); mon->peer_cnt++; mon_update_neighbors(mon, p); return true; } void tipc_mon_peer_up(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self = get_self(net, bearer_id); struct tipc_peer *peer, *head; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer && !tipc_mon_add_peer(mon, addr, &peer)) goto exit; peer->is_up = true; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } void tipc_mon_peer_down(struct net *net, u32 addr, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *head; struct tipc_mon_domain *dom; int applied; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer) { pr_warn("Mon: unknown link %x/%u DOWN\n", addr, bearer_id); goto exit; } applied = peer->applied; peer->applied = 0; dom = peer->domain; peer->domain = NULL; if (peer->is_head) mon_identify_lost_members(peer, dom, applied); kfree(dom); peer->is_up = false; peer->is_head = false; peer->is_local = false; peer->down_cnt = 0; head = peer_head(peer); if (head == self) mon_update_local_domain(mon); mon_assign_roles(mon, head); exit: write_unlock_bh(&mon->lock); } /* tipc_mon_rcv - process monitor domain event message */ void tipc_mon_rcv(struct net *net, void *data, u16 dlen, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *arrv_dom = data; struct tipc_mon_domain dom_bef; struct tipc_mon_domain *dom; struct tipc_peer *peer; u16 new_member_cnt = mon_le16_to_cpu(arrv_dom->member_cnt); int new_dlen = dom_rec_len(arrv_dom, new_member_cnt); u16 new_gen = mon_le16_to_cpu(arrv_dom->gen); u16 acked_gen = mon_le16_to_cpu(arrv_dom->ack_gen); u16 arrv_dlen = mon_le16_to_cpu(arrv_dom->len); bool probing = state->probing; int i, applied_bef; state->probing = false; /* Sanity check received domain record */ if (new_member_cnt > MAX_MON_DOMAIN) return; if (dlen < dom_rec_len(arrv_dom, 0)) return; if (dlen != dom_rec_len(arrv_dom, new_member_cnt)) return; if (dlen < new_dlen || arrv_dlen != new_dlen) return; /* Synch generation numbers with peer if link just came up */ if (!state->synched) { state->peer_gen = new_gen - 1; state->acked_gen = acked_gen; state->synched = true; } if (more(acked_gen, state->acked_gen)) state->acked_gen = acked_gen; /* Drop duplicate unless we are waiting for a probe response */ if (!more(new_gen, state->peer_gen) && !probing) return; write_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (!peer || !peer->is_up) goto exit; /* Peer is confirmed, stop any ongoing probing */ peer->down_cnt = 0; /* Task is done for duplicate record */ if (!more(new_gen, state->peer_gen)) goto exit; state->peer_gen = new_gen; /* Cache current domain record for later use */ dom_bef.member_cnt = 0; dom = peer->domain; if (dom) memcpy(&dom_bef, dom, dom->len); /* Transform and store received domain record */ if (!dom || (dom->len < new_dlen)) { kfree(dom); dom = kmalloc(new_dlen, GFP_ATOMIC); peer->domain = dom; if (!dom) goto exit; } dom->len = new_dlen; dom->gen = new_gen; dom->member_cnt = new_member_cnt; dom->up_map = mon_le64_to_cpu(arrv_dom->up_map); for (i = 0; i < new_member_cnt; i++) dom->members[i] = mon_le32_to_cpu(arrv_dom->members[i]); /* Update peers affected by this domain record */ applied_bef = peer->applied; mon_apply_domain(mon, peer); mon_identify_lost_members(peer, &dom_bef, applied_bef); mon_assign_roles(mon, peer_head(peer)); exit: write_unlock_bh(&mon->lock); } void tipc_mon_prep(struct net *net, void *data, int *dlen, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_mon_domain *dom = data; u16 gen = mon->dom_gen; u16 len; /* Send invalid record if not active */ if (!tipc_mon_is_active(net, mon)) { dom->len = 0; return; } /* Send only a dummy record with ack if peer has acked our last sent */ if (likely(state->acked_gen == gen)) { len = dom_rec_len(dom, 0); *dlen = len; dom->len = mon_cpu_to_le16(len); dom->gen = mon_cpu_to_le16(gen); dom->ack_gen = mon_cpu_to_le16(state->peer_gen); dom->member_cnt = 0; return; } /* Send the full record */ read_lock_bh(&mon->lock); len = mon_le16_to_cpu(mon->cache.len); *dlen = len; memcpy(data, &mon->cache, len); read_unlock_bh(&mon->lock); dom->ack_gen = mon_cpu_to_le16(state->peer_gen); } void tipc_mon_get_state(struct net *net, u32 addr, struct tipc_mon_state *state, int bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; if (!tipc_mon_is_active(net, mon)) { state->probing = false; state->monitoring = true; return; } /* Used cached state if table has not changed */ if (!state->probing && (state->list_gen == mon->list_gen) && (state->acked_gen == mon->dom_gen)) return; read_lock_bh(&mon->lock); peer = get_peer(mon, addr); if (peer) { state->probing = state->acked_gen != mon->dom_gen; state->probing |= peer->down_cnt; state->reset |= peer->down_cnt >= MAX_PEER_DOWN_EVENTS; state->monitoring = peer->is_local; state->monitoring |= peer->is_head; state->list_gen = mon->list_gen; } read_unlock_bh(&mon->lock); } static void mon_timeout(struct timer_list *t) { struct tipc_monitor *mon = from_timer(mon, t, timer); struct tipc_peer *self; int best_member_cnt = dom_size(mon->peer_cnt) - 1; write_lock_bh(&mon->lock); self = mon->self; if (self && (best_member_cnt != self->applied)) { mon_update_local_domain(mon); mon_assign_roles(mon, self); } write_unlock_bh(&mon->lock); mod_timer(&mon->timer, jiffies + mon->timer_intv); } int tipc_mon_create(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon; struct tipc_peer *self; struct tipc_mon_domain *dom; if (tn->monitors[bearer_id]) return 0; mon = kzalloc(sizeof(*mon), GFP_ATOMIC); self = kzalloc(sizeof(*self), GFP_ATOMIC); dom = kzalloc(sizeof(*dom), GFP_ATOMIC); if (!mon || !self || !dom) { kfree(mon); kfree(self); kfree(dom); return -ENOMEM; } tn->monitors[bearer_id] = mon; rwlock_init(&mon->lock); mon->net = net; mon->peer_cnt = 1; mon->self = self; self->domain = dom; self->addr = tipc_own_addr(net); self->is_up = true; self->is_head = true; INIT_LIST_HEAD(&self->list); timer_setup(&mon->timer, mon_timeout, 0); mon->timer_intv = msecs_to_jiffies(MON_TIMEOUT + (tn->random & 0xffff)); mod_timer(&mon->timer, jiffies + mon->timer_intv); return 0; } void tipc_mon_delete(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *self; struct tipc_peer *peer, *tmp; if (!mon) return; self = get_self(net, bearer_id); write_lock_bh(&mon->lock); tn->monitors[bearer_id] = NULL; list_for_each_entry_safe(peer, tmp, &self->list, list) { list_del(&peer->list); hlist_del(&peer->hash); kfree(peer->domain); kfree(peer); } mon->self = NULL; write_unlock_bh(&mon->lock); timer_shutdown_sync(&mon->timer); kfree(self->domain); kfree(self); kfree(mon); } void tipc_mon_reinit_self(struct net *net) { struct tipc_monitor *mon; int bearer_id; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { mon = tipc_monitor(net, bearer_id); if (!mon) continue; write_lock_bh(&mon->lock); if (mon->self) mon->self->addr = tipc_own_addr(net); write_unlock_bh(&mon->lock); } } int tipc_nl_monitor_set_threshold(struct net *net, u32 cluster_size) { struct tipc_net *tn = tipc_net(net); if (cluster_size > TIPC_CLUSTER_SIZE) return -EINVAL; tn->mon_threshold = cluster_size; return 0; } int tipc_nl_monitor_get_threshold(struct net *net) { struct tipc_net *tn = tipc_net(net); return tn->mon_threshold; } static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg) { struct tipc_mon_domain *dom = peer->domain; struct nlattr *attrs; void *hdr; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_MON_PEER_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON_PEER); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_ADDR, peer->addr)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_APPLIED, peer->applied)) goto attr_msg_full; if (peer->is_up) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_UP)) goto attr_msg_full; if (peer->is_local) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_LOCAL)) goto attr_msg_full; if (peer->is_head) if (nla_put_flag(msg->skb, TIPC_NLA_MON_PEER_HEAD)) goto attr_msg_full; if (dom) { if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEER_DOMGEN, dom->gen)) goto attr_msg_full; if (nla_put_u64_64bit(msg->skb, TIPC_NLA_MON_PEER_UPMAP, dom->up_map, TIPC_NLA_MON_PEER_PAD)) goto attr_msg_full; if (nla_put(msg->skb, TIPC_NLA_MON_PEER_MEMBERS, dom->member_cnt * sizeof(u32), &dom->members)) goto attr_msg_full; } nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_add_monitor_peer(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id, u32 *prev_node) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); struct tipc_peer *peer; if (!mon) return -EINVAL; read_lock_bh(&mon->lock); peer = mon->self; do { if (*prev_node) { if (peer->addr == *prev_node) *prev_node = 0; else continue; } if (__tipc_nl_add_monitor_peer(peer, msg)) { *prev_node = peer->addr; read_unlock_bh(&mon->lock); return -EMSGSIZE; } } while ((peer = peer_nxt(peer)) != mon->self); read_unlock_bh(&mon->lock); return 0; } int __tipc_nl_add_monitor(struct net *net, struct tipc_nl_msg *msg, u32 bearer_id) { struct tipc_monitor *mon = tipc_monitor(net, bearer_id); char bearer_name[TIPC_MAX_BEARER_NAME]; struct nlattr *attrs; void *hdr; int ret; ret = tipc_bearer_get_name(net, bearer_name, bearer_id); if (ret || !mon) return 0; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; read_lock_bh(&mon->lock); if (nla_put_u32(msg->skb, TIPC_NLA_MON_REF, bearer_id)) goto attr_msg_full; if (tipc_mon_is_active(net, mon)) if (nla_put_flag(msg->skb, TIPC_NLA_MON_ACTIVE)) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_MON_BEARER_NAME, bearer_name)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_PEERCNT, mon->peer_cnt)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_MON_LISTGEN, mon->list_gen)) goto attr_msg_full; read_unlock_bh(&mon->lock); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: read_unlock_bh(&mon->lock); nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; }
19 13 2 19 2 19 3 19 17 2 17 3 33 18 9 28 6 27 29 11 35 4 15 12 3 47 47 3 18 22 11 47 47 42 36 33 17 16 19 19 16 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2014 Fraunhofer ITWM * * Written by: * Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de> */ #include <linux/ieee802154.h> #include <net/mac802154.h> #include <net/ieee802154_netdev.h> static int ieee802154_hdr_push_addr(u8 *buf, const struct ieee802154_addr *addr, bool omit_pan) { int pos = 0; if (addr->mode == IEEE802154_ADDR_NONE) return 0; if (!omit_pan) { memcpy(buf + pos, &addr->pan_id, 2); pos += 2; } switch (addr->mode) { case IEEE802154_ADDR_SHORT: memcpy(buf + pos, &addr->short_addr, 2); pos += 2; break; case IEEE802154_ADDR_LONG: memcpy(buf + pos, &addr->extended_addr, IEEE802154_ADDR_LEN); pos += IEEE802154_ADDR_LEN; break; default: return -EINVAL; } return pos; } static int ieee802154_hdr_push_sechdr(u8 *buf, const struct ieee802154_sechdr *hdr) { int pos = 5; memcpy(buf, hdr, 1); memcpy(buf + 1, &hdr->frame_counter, 4); switch (hdr->key_id_mode) { case IEEE802154_SCF_KEY_IMPLICIT: return pos; case IEEE802154_SCF_KEY_INDEX: break; case IEEE802154_SCF_KEY_SHORT_INDEX: memcpy(buf + pos, &hdr->short_src, 4); pos += 4; break; case IEEE802154_SCF_KEY_HW_INDEX: memcpy(buf + pos, &hdr->extended_src, IEEE802154_ADDR_LEN); pos += IEEE802154_ADDR_LEN; break; } buf[pos++] = hdr->key_id; return pos; } int ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr) { u8 buf[IEEE802154_MAX_HEADER_LEN]; int pos = 2; int rc; struct ieee802154_hdr_fc *fc = &hdr->fc; buf[pos++] = hdr->seq; fc->dest_addr_mode = hdr->dest.mode; rc = ieee802154_hdr_push_addr(buf + pos, &hdr->dest, false); if (rc < 0) return -EINVAL; pos += rc; fc->source_addr_mode = hdr->source.mode; if (hdr->source.pan_id == hdr->dest.pan_id && hdr->dest.mode != IEEE802154_ADDR_NONE) fc->intra_pan = true; rc = ieee802154_hdr_push_addr(buf + pos, &hdr->source, fc->intra_pan); if (rc < 0) return -EINVAL; pos += rc; if (fc->security_enabled) { fc->version = 1; rc = ieee802154_hdr_push_sechdr(buf + pos, &hdr->sec); if (rc < 0) return -EINVAL; pos += rc; } memcpy(buf, fc, 2); memcpy(skb_push(skb, pos), buf, pos); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_push); int ieee802154_mac_cmd_push(struct sk_buff *skb, void *f, const void *pl, unsigned int pl_len) { struct ieee802154_mac_cmd_frame *frame = f; struct ieee802154_mac_cmd_pl *mac_pl = &frame->mac_pl; struct ieee802154_hdr *mhr = &frame->mhr; int ret; skb_reserve(skb, sizeof(*mhr)); ret = ieee802154_hdr_push(skb, mhr); if (ret < 0) return ret; skb_reset_mac_header(skb); skb->mac_len = ret; skb_put_data(skb, mac_pl, sizeof(*mac_pl)); skb_put_data(skb, pl, pl_len); return 0; } EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_push); int ieee802154_beacon_push(struct sk_buff *skb, struct ieee802154_beacon_frame *beacon) { struct ieee802154_beacon_hdr *mac_pl = &beacon->mac_pl; struct ieee802154_hdr *mhr = &beacon->mhr; int ret; skb_reserve(skb, sizeof(*mhr)); ret = ieee802154_hdr_push(skb, mhr); if (ret < 0) return ret; skb_reset_mac_header(skb); skb->mac_len = ret; skb_put_data(skb, mac_pl, sizeof(*mac_pl)); if (mac_pl->pend_short_addr_count || mac_pl->pend_ext_addr_count) return -EOPNOTSUPP; return 0; } EXPORT_SYMBOL_GPL(ieee802154_beacon_push); static int ieee802154_hdr_get_addr(const u8 *buf, int mode, bool omit_pan, struct ieee802154_addr *addr) { int pos = 0; addr->mode = mode; if (mode == IEEE802154_ADDR_NONE) return 0; if (!omit_pan) { memcpy(&addr->pan_id, buf + pos, 2); pos += 2; } if (mode == IEEE802154_ADDR_SHORT) { memcpy(&addr->short_addr, buf + pos, 2); return pos + 2; } else { memcpy(&addr->extended_addr, buf + pos, IEEE802154_ADDR_LEN); return pos + IEEE802154_ADDR_LEN; } } static int ieee802154_hdr_addr_len(int mode, bool omit_pan) { int pan_len = omit_pan ? 0 : 2; switch (mode) { case IEEE802154_ADDR_NONE: return 0; case IEEE802154_ADDR_SHORT: return 2 + pan_len; case IEEE802154_ADDR_LONG: return IEEE802154_ADDR_LEN + pan_len; default: return -EINVAL; } } static int ieee802154_hdr_get_sechdr(const u8 *buf, struct ieee802154_sechdr *hdr) { int pos = 5; memcpy(hdr, buf, 1); memcpy(&hdr->frame_counter, buf + 1, 4); switch (hdr->key_id_mode) { case IEEE802154_SCF_KEY_IMPLICIT: return pos; case IEEE802154_SCF_KEY_INDEX: break; case IEEE802154_SCF_KEY_SHORT_INDEX: memcpy(&hdr->short_src, buf + pos, 4); pos += 4; break; case IEEE802154_SCF_KEY_HW_INDEX: memcpy(&hdr->extended_src, buf + pos, IEEE802154_ADDR_LEN); pos += IEEE802154_ADDR_LEN; break; } hdr->key_id = buf[pos++]; return pos; } static int ieee802154_sechdr_lengths[4] = { [IEEE802154_SCF_KEY_IMPLICIT] = 5, [IEEE802154_SCF_KEY_INDEX] = 6, [IEEE802154_SCF_KEY_SHORT_INDEX] = 10, [IEEE802154_SCF_KEY_HW_INDEX] = 14, }; static int ieee802154_hdr_sechdr_len(u8 sc) { return ieee802154_sechdr_lengths[IEEE802154_SCF_KEY_ID_MODE(sc)]; } static int ieee802154_hdr_minlen(const struct ieee802154_hdr *hdr) { int dlen, slen; dlen = ieee802154_hdr_addr_len(hdr->fc.dest_addr_mode, false); slen = ieee802154_hdr_addr_len(hdr->fc.source_addr_mode, hdr->fc.intra_pan); if (slen < 0 || dlen < 0) return -EINVAL; return 3 + dlen + slen + hdr->fc.security_enabled; } static int ieee802154_hdr_get_addrs(const u8 *buf, struct ieee802154_hdr *hdr) { int pos = 0; pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.dest_addr_mode, false, &hdr->dest); pos += ieee802154_hdr_get_addr(buf + pos, hdr->fc.source_addr_mode, hdr->fc.intra_pan, &hdr->source); if (hdr->fc.intra_pan) hdr->source.pan_id = hdr->dest.pan_id; return pos; } int ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr) { int pos = 3, rc; if (!pskb_may_pull(skb, 3)) return -EINVAL; memcpy(hdr, skb->data, 3); rc = ieee802154_hdr_minlen(hdr); if (rc < 0 || !pskb_may_pull(skb, rc)) return -EINVAL; pos += ieee802154_hdr_get_addrs(skb->data + pos, hdr); if (hdr->fc.security_enabled) { int want = pos + ieee802154_hdr_sechdr_len(skb->data[pos]); if (!pskb_may_pull(skb, want)) return -EINVAL; pos += ieee802154_hdr_get_sechdr(skb->data + pos, &hdr->sec); } skb_pull(skb, pos); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_pull); int ieee802154_mac_cmd_pl_pull(struct sk_buff *skb, struct ieee802154_mac_cmd_pl *mac_pl) { if (!pskb_may_pull(skb, sizeof(*mac_pl))) return -EINVAL; memcpy(mac_pl, skb->data, sizeof(*mac_pl)); skb_pull(skb, sizeof(*mac_pl)); return 0; } EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_pl_pull); int ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr) { const u8 *buf = skb_mac_header(skb); int pos = 3, rc; if (buf + 3 > skb_tail_pointer(skb)) return -EINVAL; memcpy(hdr, buf, 3); rc = ieee802154_hdr_minlen(hdr); if (rc < 0 || buf + rc > skb_tail_pointer(skb)) return -EINVAL; pos += ieee802154_hdr_get_addrs(buf + pos, hdr); return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_peek_addrs); int ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr) { const u8 *buf = skb_mac_header(skb); int pos; pos = ieee802154_hdr_peek_addrs(skb, hdr); if (pos < 0) return -EINVAL; if (hdr->fc.security_enabled) { u8 key_id_mode = IEEE802154_SCF_KEY_ID_MODE(*(buf + pos)); int want = pos + ieee802154_sechdr_lengths[key_id_mode]; if (buf + want > skb_tail_pointer(skb)) return -EINVAL; pos += ieee802154_hdr_get_sechdr(buf + pos, &hdr->sec); } return pos; } EXPORT_SYMBOL_GPL(ieee802154_hdr_peek); int ieee802154_max_payload(const struct ieee802154_hdr *hdr) { int hlen = ieee802154_hdr_minlen(hdr); if (hdr->fc.security_enabled) { hlen += ieee802154_sechdr_lengths[hdr->sec.key_id_mode] - 1; hlen += ieee802154_sechdr_authtag_len(&hdr->sec); } return IEEE802154_MTU - hlen - IEEE802154_MFR_SIZE; } EXPORT_SYMBOL_GPL(ieee802154_max_payload);
7 7 7 7 151 144 7 7 200 200 6 195 176 169 6 1 4 12 188 185 5 174 4 11 5 3 8 195 7 2 2 3 7 7 200 6 170 13 2 2 200 10 4 6 197 7 200 200 201 201 17 17 186 201 31 174 174 173 174 173 173 170 168 37 37 28 174 31 31 5 9 8 6 14 1 8 6 5 12 6 11 1 14 17 16 15 32 31 32 11 6 10 11 4 4 4 11 9 4 9 9 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> #include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" #define NF_NAT_MAX_ATTEMPTS 128 #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; struct rcu_head rcu_head; }; struct nf_nat_hooks_net { struct nf_hook_ops *nat_hook_ops; unsigned int users; }; struct nat_net { struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; }; #ifdef CONFIG_XFRM static void nf_nat_ipv4_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi4 *fl4 = &fl->u.ip4; if (ct->status & statusbit) { fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } } static void nf_nat_ipv6_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { #if IS_ENABLED(CONFIG_IPV6) const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi6 *fl6 = &fl->u.ip6; if (ct->status & statusbit) { fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } #endif } static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; u8 family; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return; family = nf_ct_l3num(ct); dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) statusbit = IPS_DST_NAT; else statusbit = IPS_SRC_NAT; switch (family) { case NFPROTO_IPV4: nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); return; case NFPROTO_IPV6: nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); return; } } #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int hash_by_src(const struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); memset(&combined, 0, sizeof(combined)); /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; /* Zone ID can be used provided its valid for both directions */ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) combined.zone = zone->id; hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } /** * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry * @tuple: proposed NAT binding * @ignored_conntrack: our (unconfirmed) conntrack entry * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple in either direction. * * Example: * INITIATOR -> NAT/PAT -> RESPONDER * * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). * Then, later, NAT/PAT itself also connects to RESPONDER. * * This will not work if the SNAT done earlier has same IP:PORT source pair. * * Conntrack table has: * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT * * and new locally originating connection wants: * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT * * ... which would mean incoming packets cannot be distinguished between * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple). * * @return: true if the proposed NAT mapping collides with an existing entry. */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { /* Conntrack tracking doesn't keep track of outgoing tuples; only * incoming ones. NAT means they don't have a fixed mapping, * so we invert the tuple and look for the incoming reply. * * We could keep a separate hash if this proves too slow. */ struct nf_conntrack_tuple reply; nf_ct_invert_tuple(&reply, tuple); return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } static bool nf_nat_allow_clash(const struct nf_conn *ct) { return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash; } /** * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry * @tuple: proposed NAT binding * @ignored_ct: our (unconfirmed) conntrack entry * * Same as nf_nat_used_tuple, but also check for rare clash in reverse * direction. Should be called only when @tuple has not been altered, i.e. * @ignored_conntrack will not be subject to NAT. * * @return: true if the proposed NAT mapping collides with existing entry. */ static noinline bool nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_ct) { static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT; const struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conn *ct; bool taken = true; struct net *net; if (!nf_nat_used_tuple(tuple, ignored_ct)) return false; if (!nf_nat_allow_clash(ignored_ct)) return true; /* Initial choice clashes with existing conntrack. * Check for (rare) reverse collision. * * This can happen when new packets are received in both directions * at the exact same time on different CPUs. * * Without SMP, first packet creates new conntrack entry and second * packet is resolved as established reply packet. * * With parallel processing, both packets could be picked up as * new and both get their own ct entry allocated. * * If ignored_conntrack and colliding ct are not subject to NAT then * pretend the tuple is available and let later clash resolution * handle this at insertion time. * * Without it, the 'reply' packet has its source port rewritten * by nat engine. */ if (READ_ONCE(ignored_ct->status) & uses_nat) return true; net = nf_ct_net(ignored_ct); zone = nf_ct_zone(ignored_ct); thash = nf_conntrack_find_get(net, zone, tuple); if (unlikely(!thash)) /* clashing entry went away */ return false; ct = nf_ct_tuplehash_to_ctrack(thash); /* NB: IP_CT_DIR_ORIGINAL should be impossible because * nf_nat_used_tuple() handles origin collisions. * * Handle remote chance other CPU confirmed its ct right after. */ if (thash->tuple.dst.dir != IP_CT_DIR_REPLY) goto out; /* clashing connection subject to NAT? Retry with new tuple. */ if (READ_ONCE(ct->status) & uses_nat) goto out; if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) && nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) { taken = false; goto out; } out: nf_ct_put(ct); return taken; } static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | IPS_DYING; static const unsigned long flags_needed = IPS_SRC_NAT; enum tcp_conntrack old_state; old_state = READ_ONCE(ct->proto.tcp.state); if (old_state < TCP_CONNTRACK_TIME_WAIT) return false; if (flags & flags_refuse) return false; return (flags & flags_needed) == flags_needed; } /* reverse direction will send packets to new source, so * make sure such packets are invalid. */ static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) { return (__s32)(new->proto.tcp.seen[0].td_end - old->proto.tcp.seen[0].td_end) > 0; } static int nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack, unsigned int attempts_left) { static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple reply; unsigned long flags; struct nf_conn *ct; bool taken = true; struct net *net; nf_ct_invert_tuple(&reply, tuple); if (attempts_left > NF_NAT_HARDER_THRESH || tuple->dst.protonum != IPPROTO_TCP || ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) return nf_conntrack_tuple_taken(&reply, ignored_conntrack); /* :ast few attempts to find a free tcp port. Destructive * action: evict colliding if its in timewait state and the * tcp sequence number has advanced past the one used by the * old entry. */ net = nf_ct_net(ignored_conntrack); zone = nf_ct_zone(ignored_conntrack); thash = nf_conntrack_find_get(net, zone, &reply); if (!thash) return false; ct = nf_ct_tuplehash_to_ctrack(thash); if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) goto out; if (WARN_ON_ONCE(ct == ignored_conntrack)) goto out; flags = READ_ONCE(ct->status); if (!nf_nat_may_kill(ct, flags)) goto out; if (!nf_seq_has_advanced(ct, ignored_conntrack)) goto out; /* Even if we can evict do not reuse if entry is offloaded. */ if (nf_ct_kill(ct)) taken = flags & flags_offload; out: nf_ct_put(ct); return taken; } static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { if (t->src.l3num == NFPROTO_IPV4) return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; } /* Is the manipable part of the tuple between min and max incl? */ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { __be16 port; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; else port = tuple->dst.u.all; return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: return true; } } /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. */ static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. */ if (range->flags & NF_NAT_RANGE_MAP_IPS && !nf_nat_inet_in_range(tuple, range)) return 0; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) return 1; return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, &range->min_proto, &range->max_proto); } static inline int same_src(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_tuple *t; t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && t->src.u.all == tuple->src.u.all); } /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuple(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; if (nf_in_range(result, range)) return 1; } } return 0; } /* For [FUTURE] fragmentation handling, we want the least-used * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports * 1-65535, we don't do pro-rata allocation based on ports; we choose * the ip with the lowest src-ip/dst-ip/proto usage. */ static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { union nf_inet_addr *var_ipp; unsigned int i, max; /* Host order */ u32 minip, maxip, j, dist; bool full_range; /* No IP mapping? Do nothing. */ if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return; if (maniptype == NF_NAT_MANIP_SRC) var_ipp = &tuple->src.u3; else var_ipp = &tuple->dst.u3; /* Fast path: only one choice. */ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { *var_ipp = range->min_addr; return; } if (nf_ct_l3num(ct) == NFPROTO_IPV4) max = sizeof(var_ipp->ip) / sizeof(u32) - 1; else max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; /* Hashing source and destination IPs gives a fairly even * spread in practice (if there are a small number of IPs * involved, there usually aren't that many connections * anyway). The consistency means that servers see the same * client coming from the same IP (some Internet Banking sites * like this), even across reboots. */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { /* If first bytes of the address are at the maximum, use the * distance. Otherwise use the full range. */ if (!full_range) { minip = ntohl((__force __be32)range->min_addr.all[i]); maxip = ntohl((__force __be32)range->max_addr.all[i]); dist = maxip - minip + 1; } else { minip = 0; dist = ~0; } var_ipp->all[i] = (__force __u32) htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) j ^= (__force u32)tuple->dst.u3.all[i]; } } /* Alter the per-proto part of the tuple (depending on maniptype), to * give a unique tuple in the given range if possible. * * Per-protocol part of tuple is initialized to the incoming packet. */ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 0; range_size = 65536; } else { min = ntohs(range->min_proto.icmp.id); range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) return; if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else keyptr = &tuple->dst.u.gre.key; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 1; range_size = 65535; } else { min = ntohs(range->min_proto.gre.key); range_size = ntohs(range->max_proto.gre.key) - min + 1; } goto find_free_id; #endif case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else keyptr = &tuple->dst.u.all; break; default: return; } /* If no range specified... */ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == NF_NAT_MANIP_DST) return; if (ntohs(*keyptr) < 1024) { /* Loose convention: >> 512 is credential passing */ if (ntohs(*keyptr) < 512) { min = 1; range_size = 511 - min + 1; } else { min = 600; range_size = 1023 - min + 1; } } else { min = 1024; range_size = 65535 - 1024 + 1; } } else { min = ntohs(range->min_proto.all); max = ntohs(range->max_proto.all); if (unlikely(max < min)) swap(max, min); range_size = max - min + 1; } find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) || maniptype != NF_NAT_MANIP_DST) off = get_random_u16(); else off = 0; attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. * * If we can't find any free port from first offset, pick a new * one and try again, with ever smaller search window. */ another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; off = get_random_u16(); goto another_round; } /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. */ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple_new(orig_tuple, ct)) { *tuple = *orig_tuple; return; } } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. */ /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; } else if (!nf_nat_used_tuple(tuple, ct)) { return; } } /* Last chance: get protocol to try to obtain unique tuple. */ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) { struct nf_conn_nat *nat = nfct_nat(ct); if (nat) return nat; if (!nf_ct_is_confirmed(ct)) nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); return nat; } EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. */ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } EXPORT_SYMBOL(nf_nat_setup_info); static unsigned int __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, }; return nf_nat_setup_info(ct, &range, manip); } unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int verdict = NF_ACCEPT; unsigned long statusbit; if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply dir. */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. */ if (ct->status & statusbit) verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); return verdict; } EXPORT_SYMBOL_GPL(nf_nat_packet); static bool in_vrf_postrouting(const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (state->hook == NF_INET_POST_ROUTING && netif_is_l3_master(state->out)) return true; #endif return false; } unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. */ if (!nf_nat_initialized(ct, maniptype)) { struct nf_nat_lookup_hook_priv *lpriv = priv; struct nf_hook_entries *e = rcu_dereference(lpriv->entries); unsigned int ret; int i; if (!e) goto null_bind; for (i = 0; i < e->num_hook_entries; i++) { ret = e->hooks[i].hook(e->hooks[i].priv, skb, state); if (ret != NF_ACCEPT) return ret; if (nf_nat_initialized(ct, maniptype)) goto do_nat; } null_bind: ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct, ct->status); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; default: /* ESTABLISHED */ WARN_ON(ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } do_nat: return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); return NF_DROP; } EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; }; /* kill conntracks with affected NAT section */ static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) return 1; /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ return 0; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); range->max_proto.all = range->min_proto.all; range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[CTA_PROTONAT_PORT_MAX]) { range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } return 0; } static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy, NULL); if (err < 0) return err; return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_PROTO] = { .type = NLA_NESTED }, }; static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); range->flags |= NF_NAT_RANGE_MAP_IPS; } range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], range->min_addr.ip); return 0; } static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], sizeof(struct in6_addr)); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V6_MAXIP]) nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], sizeof(struct in6_addr)); else range->max_addr = range->min_addr; return 0; } static int nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_NAT_MAX+1]; int err; memset(range, 0, sizeof(*range)); err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: err = nf_nat_ipv4_nlattr_to_range(tb, range); break; case NFPROTO_IPV6: err = nf_nat_ipv6_nlattr_to_range(tb, range); break; default: err = -EPROTONOSUPPORT; break; } if (err) return err; if (!tb[CTA_NAT_PROTO]) return 0; return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } /* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range2 range; int err; /* Should not happen, restricted to creating new conntracks * via ctnetlink. */ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) return -EEXIST; /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range); if (err < 0) return err; return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { return -EOPNOTSUPP; } #endif static struct nf_ct_helper_expectfn follow_master_nat = { .name = "nat-follow-master", .expectfn = nf_nat_follow_master, }; int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; unsigned int hooknum = ops->hooknum; struct nf_hook_ops *nat_ops; int i, ret; if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) return -EINVAL; mutex_lock(&nf_nat_proto_mutex); if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; } for (i = 0; i < ops_count; i++) { priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { nat_ops[i].priv = priv; continue; } mutex_unlock(&nf_nat_proto_mutex); while (i) kfree(nat_ops[--i].priv); kfree(nat_ops); return -ENOMEM; } ret = nf_register_net_hooks(net, nat_ops, ops_count); if (ret < 0) { mutex_unlock(&nf_nat_proto_mutex); for (i = 0; i < ops_count; i++) kfree(nat_ops[i].priv); kfree(nat_ops); return ret; } nat_proto_net->nat_hook_ops = nat_ops; } nat_ops = nat_proto_net->nat_hook_ops; priv = nat_ops[hooknum].priv; if (WARN_ON_ONCE(!priv)) { mutex_unlock(&nf_nat_proto_mutex); return -EOPNOTSUPP; } ret = nf_hook_entries_insert_raw(&priv->entries, ops); if (ret == 0) nat_proto_net->users++; mutex_unlock(&nf_nat_proto_mutex); return ret; } void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; struct nf_hook_ops *nat_ops; int hooknum = ops->hooknum; int i; if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) goto unlock; nat_proto_net->users--; nat_ops = nat_proto_net->nat_hook_ops; for (i = 0; i < ops_count; i++) { if (nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) goto unlock; priv = nat_ops[hooknum].priv; nf_hook_entries_delete_raw(&priv->entries, ops); if (nat_proto_net->users == 0) { nf_unregister_net_hooks(net, nat_ops, ops_count); for (i = 0; i < ops_count; i++) { priv = nat_ops[i].priv; kfree_rcu(priv, rcu_head); } nat_proto_net->nat_hook_ops = NULL; kfree(nat_ops); } unlock: mutex_unlock(&nf_nat_proto_mutex); } static struct pernet_operations nat_net_ops = { .id = &nat_net_id, .size = sizeof(struct nat_net), }; static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) { int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; if (nf_nat_htable_size < CONNTRACK_LOCKS) nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) return -ENOMEM; for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { kvfree(nf_nat_bysource); return ret; } nf_ct_helper_expectfn_register(&follow_master_nat); WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); ret = register_nf_nat_bpf(); if (ret < 0) { RCU_INIT_POINTER(nf_nat_hook, NULL); nf_ct_helper_expectfn_unregister(&follow_master_nat); synchronize_net(); unregister_pernet_subsys(&nat_net_ops); kvfree(nf_nat_bysource); } return ret; } static void __exit nf_nat_cleanup(void) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); synchronize_net(); kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Network address translation core"); module_init(nf_nat_init); module_exit(nf_nat_cleanup);
665 2674 611 834 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 /* SPDX-License-Identifier: GPL-2.0 */ /* * NUMA memory policies for Linux. * Copyright 2003,2004 Andi Kleen SuSE Labs */ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 #include <linux/sched.h> #include <linux/mmzone.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <uapi/linux/mempolicy.h> struct mm_struct; #define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ #ifdef CONFIG_NUMA /* * Describe a memory policy. * * A mempolicy can be either associated with a process or with a VMA. * For VMA related allocations the VMA policy is preferred, otherwise * the process policy is used. Interrupts ignore the memory policy * of the current process. * * Locking policy for interleave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on * mmap_lock. * * Freeing policy: * Mempolicy objects are reference counted. A mempolicy will be freed when * mpol_put() decrements the reference count to zero. * * Duplicating policy objects: * mpol_dup() allocates a new mempolicy and copies the specified mempolicy * to the new storage. The reference count of the new object is initialized * to 1, representing the caller of mpol_dup(). */ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/preferred/etc */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; }; /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. */ extern void __mpol_put(struct mempolicy *pol); static inline void mpol_put(struct mempolicy *pol) { if (pol) __mpol_put(pol); } /* * Does mempolicy pol need explicit unref after use? * Currently only needed for shared policies. */ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. */ struct shared_policy { struct rb_root root; rwlock_t lock; }; struct sp_node { struct rb_node nd; pgoff_t start, end; struct mempolicy *policy; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long addr); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return (pol->mode == MPOL_PREFERRED_MANY); } extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else struct mempolicy {}; static inline struct mempolicy *get_task_policy(struct task_struct *p) { return NULL; } static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *pol) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { } static inline void mpol_free_shared_policy(struct shared_policy *sp) { } static inline struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr, int order, pgoff_t *ilx) { *ilx = 0; return NULL; } static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } static inline void numa_policy_init(void) { } static inline void numa_default_policy(void) { } static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return 0; } static inline void check_highest_zone(int k) { } #ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif static inline int mpol_misplaced(struct folio *folio, struct vm_fault *vmf, unsigned long address) { return -1; /* no node preference */ } static inline void mpol_put_task_policy(struct task_struct *task) { } static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; } #endif /* CONFIG_NUMA */ #endif
4 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_TC_GACT_H #define __NET_TC_GACT_H #include <net/act_api.h> #include <linux/tc_act/tc_gact.h> struct tcf_gact { struct tc_action common; #ifdef CONFIG_GACT_PROB u16 tcfg_ptype; u16 tcfg_pval; int tcfg_paction; atomic_t packets; #endif }; #define to_gact(a) ((struct tcf_gact *)a) static inline bool __is_tcf_gact_act(const struct tc_action *a, int act, bool is_ext) { #ifdef CONFIG_NET_CLS_ACT struct tcf_gact *gact; if (a->ops && a->ops->id != TCA_ID_GACT) return false; gact = to_gact(a); if ((!is_ext && gact->tcf_action == act) || (is_ext && TC_ACT_EXT_CMP(gact->tcf_action, act))) return true; #endif return false; } static inline bool is_tcf_gact_ok(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_OK, false); } static inline bool is_tcf_gact_shot(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_SHOT, false); } static inline bool is_tcf_gact_trap(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_TRAP, false); } static inline bool is_tcf_gact_goto_chain(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_GOTO_CHAIN, true); } static inline u32 tcf_gact_goto_chain_index(const struct tc_action *a) { return READ_ONCE(a->tcfa_action) & TC_ACT_EXT_VAL_MASK; } static inline bool is_tcf_gact_continue(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_UNSPEC, false); } static inline bool is_tcf_gact_reclassify(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_RECLASSIFY, false); } static inline bool is_tcf_gact_pipe(const struct tc_action *a) { return __is_tcf_gact_act(a, TC_ACT_PIPE, false); } #endif /* __NET_TC_GACT_H */
1 1 3 3 1 1 2 1 3 3 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0-or-later /* * PCBC: Propagating Cipher Block Chaining mode * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Derived from cbc.c * - Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> static int crypto_pcbc_encrypt_segment(struct skcipher_request *req, struct skcipher_walk *walk, struct crypto_cipher *tfm) { int bsize = crypto_cipher_blocksize(tfm); const u8 *src = walk->src.virt.addr; unsigned int nbytes = walk->nbytes; u8 *dst = walk->dst.virt.addr; u8 * const iv = walk->iv; do { crypto_xor(iv, src, bsize); crypto_cipher_encrypt_one(tfm, dst, iv); crypto_xor_cpy(iv, dst, src, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_pcbc_encrypt_inplace(struct skcipher_request *req, struct skcipher_walk *walk, struct crypto_cipher *tfm) { int bsize = crypto_cipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *dst = walk->dst.virt.addr; u8 * const iv = walk->iv; u8 tmpbuf[MAX_CIPHER_BLOCKSIZE]; do { memcpy(tmpbuf, dst, bsize); crypto_xor(iv, dst, bsize); crypto_cipher_encrypt_one(tfm, dst, iv); crypto_xor_cpy(iv, tmpbuf, dst, bsize); dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_pcbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { if (walk.src.virt.addr == walk.dst.virt.addr) nbytes = crypto_pcbc_encrypt_inplace(req, &walk, cipher); else nbytes = crypto_pcbc_encrypt_segment(req, &walk, cipher); err = skcipher_walk_done(&walk, nbytes); } return err; } static int crypto_pcbc_decrypt_segment(struct skcipher_request *req, struct skcipher_walk *walk, struct crypto_cipher *tfm) { int bsize = crypto_cipher_blocksize(tfm); const u8 *src = walk->src.virt.addr; unsigned int nbytes = walk->nbytes; u8 *dst = walk->dst.virt.addr; u8 * const iv = walk->iv; do { crypto_cipher_decrypt_one(tfm, dst, src); crypto_xor(dst, iv, bsize); crypto_xor_cpy(iv, dst, src, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_pcbc_decrypt_inplace(struct skcipher_request *req, struct skcipher_walk *walk, struct crypto_cipher *tfm) { int bsize = crypto_cipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *dst = walk->dst.virt.addr; u8 * const iv = walk->iv; u8 tmpbuf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(u32)); do { memcpy(tmpbuf, dst, bsize); crypto_cipher_decrypt_one(tfm, dst, dst); crypto_xor(dst, iv, bsize); crypto_xor_cpy(iv, dst, tmpbuf, bsize); dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_pcbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { if (walk.src.virt.addr == walk.dst.virt.addr) nbytes = crypto_pcbc_decrypt_inplace(req, &walk, cipher); else nbytes = crypto_pcbc_decrypt_segment(req, &walk, cipher); err = skcipher_walk_done(&walk, nbytes); } return err; } static int crypto_pcbc_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; int err; inst = skcipher_alloc_instance_simple(tmpl, tb); if (IS_ERR(inst)) return PTR_ERR(inst); inst->alg.encrypt = crypto_pcbc_encrypt; inst->alg.decrypt = crypto_pcbc_decrypt; err = skcipher_register_instance(tmpl, inst); if (err) inst->free(inst); return err; } static struct crypto_template crypto_pcbc_tmpl = { .name = "pcbc", .create = crypto_pcbc_create, .module = THIS_MODULE, }; static int __init crypto_pcbc_module_init(void) { return crypto_register_template(&crypto_pcbc_tmpl); } static void __exit crypto_pcbc_module_exit(void) { crypto_unregister_template(&crypto_pcbc_tmpl); } subsys_initcall(crypto_pcbc_module_init); module_exit(crypto_pcbc_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("PCBC block cipher mode of operation"); MODULE_ALIAS_CRYPTO("pcbc"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
1315 1840 1897 1852 555 326 525 324 4 323 111 118 198 1840 493 1838 598 598 597 579 236 286 587 42 456 426 598 406 599 628 630 599 118 253 236 2 237 158 22 155 76 283 190 262 27 60 231 10 225 914 893 893 52 1932 1930 893 894 1620 1619 1319 1321 1320 1315 9 2 8 81 81 138 37 105 233 234 157 157 186 186 469 469 1620 5 2 5 1092 1088 1090 2 2 8 572 574 576 2 2 234 234 158 158 186 186 469 469 1621 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/dev_addr_lists.c - Functions for handling net device lists * Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com> * * This file contains functions for working with unicast, multicast and device * addresses lists. */ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/export.h> #include <linux/list.h> #include "dev.h" /* * General list handling functions */ static int __hw_addr_insert(struct netdev_hw_addr_list *list, struct netdev_hw_addr *new, int addr_len) { struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; while (*ins_point) { int diff; ha = rb_entry(*ins_point, struct netdev_hw_addr, node); diff = memcmp(new->addr, ha->addr, addr_len); if (diff == 0) diff = memcmp(&new->type, &ha->type, sizeof(new->type)); parent = *ins_point; if (diff < 0) ins_point = &parent->rb_left; else if (diff > 0) ins_point = &parent->rb_right; else return -EEXIST; } rb_link_node_rcu(&new->node, parent, ins_point); rb_insert_color(&new->node, &list->tree); return 0; } static struct netdev_hw_addr* __hw_addr_create(const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; int alloc_size; alloc_size = sizeof(*ha); if (alloc_size < L1_CACHE_BYTES) alloc_size = L1_CACHE_BYTES; ha = kmalloc(alloc_size, GFP_ATOMIC); if (!ha) return NULL; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; ha->global_use = global; ha->synced = sync ? 1 : 0; ha->sync_cnt = 0; return ha; } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync, int sync_count, bool exclusive) { struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; if (addr_len > MAX_ADDR_LEN) return -EINVAL; while (*ins_point) { int diff; ha = rb_entry(*ins_point, struct netdev_hw_addr, node); diff = memcmp(addr, ha->addr, addr_len); if (diff == 0) diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); parent = *ins_point; if (diff < 0) { ins_point = &parent->rb_left; } else if (diff > 0) { ins_point = &parent->rb_right; } else { if (exclusive) return -EEXIST; if (global) { /* check if addr is already used as global */ if (ha->global_use) return 0; else ha->global_use = true; } if (sync) { if (ha->synced && sync_count) return -EEXIST; else ha->synced++; } ha->refcount++; return 0; } } ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); if (!ha) return -ENOMEM; rb_link_node(&ha->node, parent, ins_point); rb_insert_color(&ha->node, &list->tree); list_add_tail_rcu(&ha->list, &list->list); list->count++; return 0; } static int __hw_addr_add(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, 0, false); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, struct netdev_hw_addr *ha, bool global, bool sync) { if (global && !ha->global_use) return -ENOENT; if (sync && !ha->synced) return -ENOENT; if (global) ha->global_use = false; if (sync) ha->synced--; if (--ha->refcount) return 0; rb_erase(&ha->node, &list->tree); list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); list->count--; return 0; } static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { struct rb_node *node; node = list->tree.rb_node; while (node) { struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); int diff = memcmp(addr, ha->addr, addr_len); if (diff == 0 && addr_type) diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); if (diff < 0) node = node->rb_left; else if (diff > 0) node = node->rb_right; else return ha; } return NULL; } static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); if (!ha) return -ENOENT; return __hw_addr_del_entry(list, ha, global, sync); } static int __hw_addr_del(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type) { return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false); } static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr *ha, int addr_len) { int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, false, true, ha->sync_cnt, false); if (err && err != -EEXIST) return err; if (!err) { ha->sync_cnt++; ha->refcount++; } return 0; } static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, struct netdev_hw_addr *ha, int addr_len) { int err; err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type, false, true); if (err) return; ha->sync_cnt--; /* address on from list is not marked synced */ __hw_addr_del_entry(from_list, ha, false, false); } int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->sync_cnt == ha->refcount) { __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } else { err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; } } return err; } EXPORT_SYMBOL(__hw_addr_sync_multiple); /* This function only works where there is a strict 1-1 relationship * between source and destination of they synch. If you ever need to * sync addresses to more then 1 destination, you need to use * __hw_addr_sync_multiple(). */ int __hw_addr_sync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (!ha->sync_cnt) { err = __hw_addr_sync_one(to_list, ha, addr_len); if (err) break; } else if (ha->refcount == 1) __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } return err; } EXPORT_SYMBOL(__hw_addr_sync); void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, struct netdev_hw_addr_list *from_list, int addr_len) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &from_list->list, list) { if (ha->sync_cnt) __hw_addr_unsync_one(to_list, from_list, ha, addr_len); } } EXPORT_SYMBOL(__hw_addr_unsync); /** * __hw_addr_sync_dev - Synchronize device's multicast list * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed * * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address add/remove * notifications. The unsync function may be NULL in which case * the addresses requiring removal will simply be removed without * any notification to the device. **/ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *), int (*unsync)(struct net_device *, const unsigned char *)) { struct netdev_hw_addr *ha, *tmp; int err; /* first go through and flush out any stale entries */ list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt || ha->refcount != 1) continue; /* if unsync is defined and fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr)) continue; ha->sync_cnt--; __hw_addr_del_entry(list, ha, false, false); } /* go through and sync new entries to the list */ list_for_each_entry_safe(ha, tmp, &list->list, list) { if (ha->sync_cnt) continue; err = sync(dev, ha->addr); if (err) return err; ha->sync_cnt++; ha->refcount++; } return 0; } EXPORT_SYMBOL(__hw_addr_sync_dev); /** * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking * into account references * @list: address list to synchronize * @dev: device to sync * @sync: function to call if address or reference on it should be added * @unsync: function to call if address or some reference on it should removed * * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address or references on it * add/remove notifications. The unsync function may be NULL in which case * the addresses or references on it requiring removal will simply be * removed without any notification to the device. That is responsibility of * the driver to identify and distribute address or references on it between * internal address tables. **/ int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*sync)(struct net_device *, const unsigned char *, int), int (*unsync)(struct net_device *, const unsigned char *, int)) { struct netdev_hw_addr *ha, *tmp; int err, ref_cnt; /* first go through and flush out any unsynced/stale entries */ list_for_each_entry_safe(ha, tmp, &list->list, list) { /* sync if address is not used */ if ((ha->sync_cnt << 1) <= ha->refcount) continue; /* if fails defer unsyncing address */ ref_cnt = ha->refcount - ha->sync_cnt; if (unsync && unsync(dev, ha->addr, ref_cnt)) continue; ha->refcount = (ref_cnt << 1) + 1; ha->sync_cnt = ref_cnt; __hw_addr_del_entry(list, ha, false, false); } /* go through and sync updated/new entries to the list */ list_for_each_entry_safe(ha, tmp, &list->list, list) { /* sync if address added or reused */ if ((ha->sync_cnt << 1) >= ha->refcount) continue; ref_cnt = ha->refcount - ha->sync_cnt; err = sync(dev, ha->addr, ref_cnt); if (err) return err; ha->refcount = ref_cnt << 1; ha->sync_cnt = ref_cnt; } return 0; } EXPORT_SYMBOL(__hw_addr_ref_sync_dev); /** * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on * it from device * @list: address list to remove synchronized addresses (references on it) from * @dev: device to sync * @unsync: function to call if address and references on it should be removed * * Remove all addresses that were added to the device by * __hw_addr_ref_sync_dev(). This function is intended to be called from the * ndo_stop or ndo_open functions on devices that require explicit address (or * references on it) add/remove notifications. If the unsync function pointer * is NULL then this function can be used to just reset the sync_cnt for the * addresses in the list. **/ void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *, int)) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt) continue; /* if fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr, ha->sync_cnt)) continue; ha->refcount -= ha->sync_cnt - 1; ha->sync_cnt = 0; __hw_addr_del_entry(list, ha, false, false); } } EXPORT_SYMBOL(__hw_addr_ref_unsync_dev); /** * __hw_addr_unsync_dev - Remove synchronized addresses from device * @list: address list to remove synchronized addresses from * @dev: device to sync * @unsync: function to call if address should be removed * * Remove all addresses that were added to the device by __hw_addr_sync_dev(). * This function is intended to be called from the ndo_stop or ndo_open * functions on devices that require explicit address add/remove * notifications. If the unsync function pointer is NULL then this function * can be used to just reset the sync_cnt for the addresses in the list. **/ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, struct net_device *dev, int (*unsync)(struct net_device *, const unsigned char *)) { struct netdev_hw_addr *ha, *tmp; list_for_each_entry_safe(ha, tmp, &list->list, list) { if (!ha->sync_cnt) continue; /* if unsync is defined and fails defer unsyncing address */ if (unsync && unsync(dev, ha->addr)) continue; ha->sync_cnt--; __hw_addr_del_entry(list, ha, false, false); } } EXPORT_SYMBOL(__hw_addr_unsync_dev); static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; list->tree = RB_ROOT; list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); } list->count = 0; } void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; list->tree = RB_ROOT; } EXPORT_SYMBOL(__hw_addr_init); /* * Device addresses handling functions */ /* Check that netdev->dev_addr is not written to directly as this would * break the rbtree layout. All changes should go thru dev_addr_set() and co. * Remove this check in mid-2024. */ void dev_addr_check(struct net_device *dev) { if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN)) return; netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr); netdev_warn(dev, "Expected addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr_shadow); netdev_WARN(dev, "Incorrect netdev->dev_addr\n"); } /** * dev_addr_flush - Flush device address list * @dev: device * * Flush device address list and reset ->dev_addr. * * The caller must hold the rtnl_mutex. */ void dev_addr_flush(struct net_device *dev) { /* rtnl_mutex must be held here */ dev_addr_check(dev); __hw_addr_flush(&dev->dev_addrs); dev->dev_addr = NULL; } /** * dev_addr_init - Init device address list * @dev: device * * Init device address list and create the first element, * used by ->dev_addr. * * The caller must hold the rtnl_mutex. */ int dev_addr_init(struct net_device *dev) { unsigned char addr[MAX_ADDR_LEN]; struct netdev_hw_addr *ha; int err; /* rtnl_mutex must be held here */ __hw_addr_init(&dev->dev_addrs); memset(addr, 0, sizeof(addr)); err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* * Get the first (previously created) address from the list * and set dev_addr pointer to this location. */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); dev->dev_addr = ha->addr; } return err; } void dev_addr_mod(struct net_device *dev, unsigned int offset, const void *addr, size_t len) { struct netdev_hw_addr *ha; dev_addr_check(dev); ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]); rb_erase(&ha->node, &dev->dev_addrs.tree); memcpy(&ha->addr[offset], addr, len); memcpy(&dev->dev_addr_shadow[offset], addr, len); WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len)); } EXPORT_SYMBOL(dev_addr_mod); /** * dev_addr_add - Add a device address * @dev: device * @addr: address to add * @addr_type: address type * * Add a device address to the device or increase the reference count if * it already exists. * * The caller must hold the rtnl_mutex. */ int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; ASSERT_RTNL(); err = dev_pre_changeaddr_notify(dev, addr, NULL); if (err) return err; err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } EXPORT_SYMBOL(dev_addr_add); /** * dev_addr_del - Release a device address. * @dev: device * @addr: address to delete * @addr_type: address type * * Release reference to a device address and remove it from the device * if the reference count drops to zero. * * The caller must hold the rtnl_mutex. */ int dev_addr_del(struct net_device *dev, const unsigned char *addr, unsigned char addr_type) { int err; struct netdev_hw_addr *ha; ASSERT_RTNL(); /* * We can not remove the first address from the list because * dev->dev_addr points to that. */ ha = list_first_entry(&dev->dev_addrs.list, struct netdev_hw_addr, list); if (!memcmp(ha->addr, addr, dev->addr_len) && ha->type == addr_type && ha->refcount == 1) return -ENOENT; err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; } EXPORT_SYMBOL(dev_addr_del); /* * Unicast list handling functions */ /** * dev_uc_add_excl - Add a global secondary unicast address * @dev: device * @addr: address to add */ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST, true, false, 0, true); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_add_excl); /** * dev_uc_add - Add a secondary unicast address * @dev: device * @addr: address to add * * Add a secondary unicast address to the device or increase * the reference count if it already exists. */ int dev_uc_add(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_add); /** * dev_uc_del - Release secondary unicast address. * @dev: device * @addr: address to delete * * Release reference to a secondary unicast address and remove it * from the device if the reference count drops to zero. */ int dev_uc_del(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_del(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_uc_del); /** * dev_uc_sync - Synchronize device's unicast list to another device * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. This function assumes that * addresses will only ever be synced to the @to devices and no other. */ int dev_uc_sync(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync); /** * dev_uc_sync_multiple - Synchronize device's unicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have been deleted from the source. The source device * must be locked by netif_addr_lock_bh. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. It allows for a single source * device to be synced to multiple destination devices. */ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_uc_sync_multiple); /** * dev_uc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device * * Remove all addresses that were added to the destination device by * dev_uc_sync(). This function is intended to be called from the * dev->stop function of layered software devices. */ void dev_uc_unsync(struct net_device *to, struct net_device *from) { if (to->addr_len != from->addr_len) return; /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two * reasons: * 1) This is always called without any addr_list_lock, so as the * outermost one here, it must be 0. * 2) This is called by some callers after unlinking the upper device, * so the dev->lower_level becomes 1 again. * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or * larger. */ netif_addr_lock_bh(from); netif_addr_lock(to); __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_uc_unsync); /** * dev_uc_flush - Flush unicast addresses * @dev: device * * Flush unicast addresses. */ void dev_uc_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->uc); netif_addr_unlock_bh(dev); } EXPORT_SYMBOL(dev_uc_flush); /** * dev_uc_init - Init unicast address list * @dev: device * * Init unicast address list. */ void dev_uc_init(struct net_device *dev) { __hw_addr_init(&dev->uc); } EXPORT_SYMBOL(dev_uc_init); /* * Multicast list handling functions */ /** * dev_mc_add_excl - Add a global secondary multicast address * @dev: device * @addr: address to add */ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, true, false, 0, true); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_mc_add_excl); static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, bool global) { int err; netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, global, false, 0, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } /** * dev_mc_add - Add a multicast address * @dev: device * @addr: address to add * * Add a multicast address to the device or increase * the reference count if it already exists. */ int dev_mc_add(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, false); } EXPORT_SYMBOL(dev_mc_add); /** * dev_mc_add_global - Add a global multicast address * @dev: device * @addr: address to add * * Add a global multicast address to the device. */ int dev_mc_add_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_add(dev, addr, true); } EXPORT_SYMBOL(dev_mc_add_global); static int __dev_mc_del(struct net_device *dev, const unsigned char *addr, bool global) { int err; netif_addr_lock_bh(dev); err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len, NETDEV_HW_ADDR_T_MULTICAST, global, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); return err; } /** * dev_mc_del - Delete a multicast address. * @dev: device * @addr: address to delete * * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ int dev_mc_del(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, false); } EXPORT_SYMBOL(dev_mc_del); /** * dev_mc_del_global - Delete a global multicast address. * @dev: device * @addr: address to delete * * Release reference to a multicast address and remove it * from the device if the reference count drops to zero. */ int dev_mc_del_global(struct net_device *dev, const unsigned char *addr) { return __dev_mc_del(dev, addr, true); } EXPORT_SYMBOL(dev_mc_del_global); /** * dev_mc_sync - Synchronize device's multicast list to another device * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the ndo_set_rx_mode * function of layered software devices. */ int dev_mc_sync(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync); /** * dev_mc_sync_multiple - Synchronize device's multicast list to another * device, but allow for multiple calls to sync to multiple devices. * @to: destination device * @from: source device * * Add newly added addresses to the destination device and release * addresses that have no users left. The source device must be * locked by netif_addr_lock_bh. * * This function is intended to be called from the ndo_set_rx_mode * function of layered software devices. It allows for a single * source device to be synced to multiple destination devices. */ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from) { int err = 0; if (to->addr_len != from->addr_len) return -EINVAL; netif_addr_lock(to); err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len); if (!err) __dev_set_rx_mode(to); netif_addr_unlock(to); return err; } EXPORT_SYMBOL(dev_mc_sync_multiple); /** * dev_mc_unsync - Remove synchronized addresses from the destination device * @to: destination device * @from: source device * * Remove all addresses that were added to the destination device by * dev_mc_sync(). This function is intended to be called from the * dev->stop function of layered software devices. */ void dev_mc_unsync(struct net_device *to, struct net_device *from) { if (to->addr_len != from->addr_len) return; /* See the above comments inside dev_uc_unsync(). */ netif_addr_lock_bh(from); netif_addr_lock(to); __hw_addr_unsync(&to->mc, &from->mc, to->addr_len); __dev_set_rx_mode(to); netif_addr_unlock(to); netif_addr_unlock_bh(from); } EXPORT_SYMBOL(dev_mc_unsync); /** * dev_mc_flush - Flush multicast addresses * @dev: device * * Flush multicast addresses. */ void dev_mc_flush(struct net_device *dev) { netif_addr_lock_bh(dev); __hw_addr_flush(&dev->mc); netif_addr_unlock_bh(dev); } EXPORT_SYMBOL(dev_mc_flush); /** * dev_mc_init - Init multicast address list * @dev: device * * Init multicast address list. */ void dev_mc_init(struct net_device *dev) { __hw_addr_init(&dev->mc); } EXPORT_SYMBOL(dev_mc_init);
2 4 4 12 9 3 4 4 1 1 1 1 3 1 2 3 3 3 3 1 1 1 1 1 1 13 5 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H #include <linux/rcupdate_wait.h> #define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) #define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) #define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) #define mtype_do_add IPSET_TOKEN(MTYPE, _do_add) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_do_del IPSET_TOKEN(MTYPE, _do_del) #define mtype_do_list IPSET_TOKEN(MTYPE, _do_list) #define mtype_do_head IPSET_TOKEN(MTYPE, _do_head) #define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem) #define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) #define mtype_memsize IPSET_TOKEN(MTYPE, _memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_add IPSET_TOKEN(MTYPE, _add) #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct mtype *map = set->data; timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } static void mtype_ext_cleanup(struct ip_set *set) { struct mtype *map = set->data; u32 id; for (id = 0; id < map->elements; id++) if (test_bit(id, map->members)) ip_set_ext_destroy(set, get_ext(set, map, id)); } static void mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); ip_set_free(map); set->data = NULL; } static void mtype_flush(struct ip_set *set) { struct mtype *map = set->data; if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } /* Calculate the actual memory size of the set data */ static size_t mtype_memsize(const struct mtype *map, size_t dsize) { return sizeof(*map) + map->memsize + map->elements * dsize; } static int mtype_head(struct ip_set *set, struct sk_buff *skb) { const struct mtype *map = set->data; struct nlattr *nested; size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; nla_nest_end(skb, nested); return 0; nla_put_failure: return -EMSGSIZE; } static int mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); int ret = mtype_do_test(e, map, set->dsize); if (ret <= 0) return ret; return ip_set_match_extensions(set, ext, mext, flags, x); } static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); int ret = mtype_do_add(e, map, flags, set->dsize); if (ret == IPSET_ADD_FAILED) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(x, set))) { set->elements--; ret = 0; } else if (!(flags & IPSET_FLAG_EXIST)) { set_bit(e->id, map->members); return -IPSET_ERR_EXIST; } /* Element is re-added, cleanup extensions */ ip_set_ext_destroy(set, x); } if (ret > 0) set->elements--; if (SET_WITH_TIMEOUT(set)) #ifdef IP_SET_BITMAP_STORED_TIMEOUT mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret); #else ip_set_timeout_set(ext_timeout(x, set), ext->timeout); #endif if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(x, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(set, ext_comment(x, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(x, set), ext); /* Activate element */ set_bit(e->id, map->members); set->elements++; return 0; } static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); if (mtype_do_del(e, map)) return -IPSET_ERR_EXIST; ip_set_ext_destroy(set, x); set->elements--; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(x, set))) return -IPSET_ERR_EXIST; return 0; } #ifndef IP_SET_BITMAP_STORED_TIMEOUT static bool mtype_is_filled(const struct mtype_elem *x) { return true; } #endif static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { struct mtype *map = set->data; struct nlattr *adt, *nested; void *x; u32 id, first = cb->args[IPSET_CB_ARG0]; int ret = 0; adt = nla_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; /* Extensions may be replaced */ rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { cond_resched_rcu(); id = cb->args[IPSET_CB_ARG0]; x = get_ext(set, map, id); if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT mtype_is_filled(x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); ret = -EMSGSIZE; goto out; } goto nla_put_failure; } if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, mtype_is_filled(x))) goto nla_put_failure; nla_nest_end(skb, nested); } nla_nest_end(skb, adt); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(id == first)) { cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } nla_nest_end(skb, adt); out: rcu_read_unlock(); return ret; } static void mtype_gc(struct timer_list *t) { struct mtype *map = from_timer(map, t, gc); struct ip_set *set = map->set; void *x; u32 id; /* We run parallel with other readers (test element) * but adding/deleting new entries is locked out */ spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { x = get_ext(set, map, id); if (ip_set_timeout_expired(ext_timeout(x, set))) { clear_bit(id, map->members); ip_set_ext_destroy(set, x); set->elements--; } } spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); } static void mtype_cancel_gc(struct ip_set *set) { struct mtype *map = set->data; if (SET_WITH_TIMEOUT(set)) timer_delete_sync(&map->gc); } static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, .adt = { [IPSET_ADD] = mtype_add, [IPSET_DEL] = mtype_del, [IPSET_TEST] = mtype_test, }, .destroy = mtype_destroy, .flush = mtype_flush, .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock LSM - Ruleset management * * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #ifndef _SECURITY_LANDLOCK_RULESET_H #define _SECURITY_LANDLOCK_RULESET_H #include <linux/cleanup.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include "access.h" #include "limits.h" #include "object.h" struct landlock_hierarchy; /** * struct landlock_layer - Access rights for a given layer */ struct landlock_layer { /** * @level: Position of this layer in the layer stack. */ u16 level; /** * @access: Bitfield of allowed actions on the kernel object. They are * relative to the object type (e.g. %LANDLOCK_ACTION_FS_READ). */ access_mask_t access; }; /** * union landlock_key - Key of a ruleset's red-black tree */ union landlock_key { /** * @object: Pointer to identify a kernel object (e.g. an inode). */ struct landlock_object *object; /** * @data: Raw data to identify an arbitrary 32-bit value * (e.g. a TCP port). */ uintptr_t data; }; /** * enum landlock_key_type - Type of &union landlock_key */ enum landlock_key_type { /** * @LANDLOCK_KEY_INODE: Type of &landlock_ruleset.root_inode's node * keys. */ LANDLOCK_KEY_INODE = 1, /** * @LANDLOCK_KEY_NET_PORT: Type of &landlock_ruleset.root_net_port's * node keys. */ LANDLOCK_KEY_NET_PORT, }; /** * struct landlock_id - Unique rule identifier for a ruleset */ struct landlock_id { /** * @key: Identifies either a kernel object (e.g. an inode) or * a raw value (e.g. a TCP port). */ union landlock_key key; /** * @type: Type of a landlock_ruleset's root tree. */ const enum landlock_key_type type; }; /** * struct landlock_rule - Access rights tied to an object */ struct landlock_rule { /** * @node: Node in the ruleset's red-black tree. */ struct rb_node node; /** * @key: A union to identify either a kernel object (e.g. an inode) or * a raw data value (e.g. a network socket port). This is used as a key * for this ruleset element. The pointer is set once and never * modified. It always points to an allocated object because each rule * increments the refcount of its object. */ union landlock_key key; /** * @num_layers: Number of entries in @layers. */ u32 num_layers; /** * @layers: Stack of layers, from the latest to the newest, implemented * as a flexible array member (FAM). */ struct landlock_layer layers[] __counted_by(num_layers); }; /** * struct landlock_ruleset - Landlock ruleset * * This data structure must contain unique entries, be updatable, and quick to * match an object. */ struct landlock_ruleset { /** * @root_inode: Root of a red-black tree containing &struct * landlock_rule nodes with inode object. Once a ruleset is tied to a * process (i.e. as a domain), this tree is immutable until @usage * reaches zero. */ struct rb_root root_inode; #if IS_ENABLED(CONFIG_INET) /** * @root_net_port: Root of a red-black tree containing &struct * landlock_rule nodes with network port. Once a ruleset is tied to a * process (i.e. as a domain), this tree is immutable until @usage * reaches zero. */ struct rb_root root_net_port; #endif /* IS_ENABLED(CONFIG_INET) */ /** * @hierarchy: Enables hierarchy identification even when a parent * domain vanishes. This is needed for the ptrace protection. */ struct landlock_hierarchy *hierarchy; union { /** * @work_free: Enables to free a ruleset within a lockless * section. This is only used by * landlock_put_ruleset_deferred() when @usage reaches zero. * The fields @lock, @usage, @num_rules, @num_layers and * @access_masks are then unused. */ struct work_struct work_free; struct { /** * @lock: Protects against concurrent modifications of * @root, if @usage is greater than zero. */ struct mutex lock; /** * @usage: Number of processes (i.e. domains) or file * descriptors referencing this ruleset. */ refcount_t usage; /** * @num_rules: Number of non-overlapping (i.e. not for * the same object) rules in this ruleset. */ u32 num_rules; /** * @num_layers: Number of layers that are used in this * ruleset. This enables to check that all the layers * allow an access request. A value of 0 identifies a * non-merged ruleset (i.e. not a domain). */ u32 num_layers; /** * @access_masks: Contains the subset of filesystem and * network actions that are restricted by a ruleset. * A domain saves all layers of merged rulesets in a * stack (FAM), starting from the first layer to the * last one. These layers are used when merging * rulesets, for user space backward compatibility * (i.e. future-proof), and to properly handle merged * rulesets without overlapping access rights. These * layers are set once and never changed for the * lifetime of the ruleset. */ struct access_masks access_masks[]; }; }; }; struct landlock_ruleset * landlock_create_ruleset(const access_mask_t access_mask_fs, const access_mask_t access_mask_net, const access_mask_t scope_mask); void landlock_put_ruleset(struct landlock_ruleset *const ruleset); void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset); DEFINE_FREE(landlock_put_ruleset, struct landlock_ruleset *, if (!IS_ERR_OR_NULL(_T)) landlock_put_ruleset(_T)) int landlock_insert_rule(struct landlock_ruleset *const ruleset, const struct landlock_id id, const access_mask_t access); struct landlock_ruleset * landlock_merge_ruleset(struct landlock_ruleset *const parent, struct landlock_ruleset *const ruleset); const struct landlock_rule * landlock_find_rule(const struct landlock_ruleset *const ruleset, const struct landlock_id id); static inline void landlock_get_ruleset(struct landlock_ruleset *const ruleset) { if (ruleset) refcount_inc(&ruleset->usage); } /** * landlock_union_access_masks - Return all access rights handled in the * domain * * @domain: Landlock ruleset (used as a domain) * * Returns: an access_masks result of the OR of all the domain's access masks. */ static inline struct access_masks landlock_union_access_masks(const struct landlock_ruleset *const domain) { union access_masks_all matches = {}; size_t layer_level; for (layer_level = 0; layer_level < domain->num_layers; layer_level++) { union access_masks_all layer = { .masks = domain->access_masks[layer_level], }; matches.all |= layer.all; } return matches.masks; } static inline void landlock_add_fs_access_mask(struct landlock_ruleset *const ruleset, const access_mask_t fs_access_mask, const u16 layer_level) { access_mask_t fs_mask = fs_access_mask & LANDLOCK_MASK_ACCESS_FS; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(fs_access_mask != fs_mask); ruleset->access_masks[layer_level].fs |= fs_mask; } static inline void landlock_add_net_access_mask(struct landlock_ruleset *const ruleset, const access_mask_t net_access_mask, const u16 layer_level) { access_mask_t net_mask = net_access_mask & LANDLOCK_MASK_ACCESS_NET; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(net_access_mask != net_mask); ruleset->access_masks[layer_level].net |= net_mask; } static inline void landlock_add_scope_mask(struct landlock_ruleset *const ruleset, const access_mask_t scope_mask, const u16 layer_level) { access_mask_t mask = scope_mask & LANDLOCK_MASK_SCOPE; /* Should already be checked in sys_landlock_create_ruleset(). */ WARN_ON_ONCE(scope_mask != mask); ruleset->access_masks[layer_level].scope |= mask; } static inline access_mask_t landlock_get_fs_access_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { /* Handles all initially denied by default access rights. */ return ruleset->access_masks[layer_level].fs | _LANDLOCK_ACCESS_FS_INITIALLY_DENIED; } static inline access_mask_t landlock_get_net_access_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { return ruleset->access_masks[layer_level].net; } static inline access_mask_t landlock_get_scope_mask(const struct landlock_ruleset *const ruleset, const u16 layer_level) { return ruleset->access_masks[layer_level].scope; } bool landlock_unmask_layers(const struct landlock_rule *const rule, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const size_t masks_array_size); access_mask_t landlock_init_layer_masks(const struct landlock_ruleset *const domain, const access_mask_t access_request, layer_mask_t (*const layer_masks)[], const enum landlock_key_type key_type); #endif /* _SECURITY_LANDLOCK_RULESET_H */
5 5 5 5 5 110 109 1 9 7 2 9 13 1 4 4 4 2 2 4 17 17 26 62 61 2 63 122 123 123 97 287 282 5 1 1 122 123 234 214 26 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/tcp.h> #include <linux/rcupdate.h> #include <net/tcp.h> void tcp_fastopen_init_key_once(struct net *net) { u8 key[TCP_FASTOPEN_KEY_LENGTH]; struct tcp_fastopen_context *ctxt; rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) { rcu_read_unlock(); return; } rcu_read_unlock(); /* tcp_fastopen_reset_cipher publishes the new context * atomically, so we allow this race happening here. * * All call sites of tcp_fastopen_cookie_gen also check * for a valid cookie, so this is an acceptable risk. */ get_random_bytes(key, sizeof(key)); tcp_fastopen_reset_cipher(net, NULL, key, NULL); } static void tcp_fastopen_ctx_free(struct rcu_head *head) { struct tcp_fastopen_context *ctx = container_of(head, struct tcp_fastopen_context, rcu); kfree_sensitive(ctx); } void tcp_fastopen_destroy_cipher(struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference_protected( inet_csk(sk)->icsk_accept_queue.fastopenq.ctx, 1); if (ctx) call_rcu(&ctx->rcu, tcp_fastopen_ctx_free); } void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; ctxt = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL)); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); } int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key) { struct tcp_fastopen_context *ctx, *octx; struct fastopen_queue *q; int err = 0; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto out; } ctx->key[0].key[0] = get_unaligned_le64(primary_key); ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8); if (backup_key) { ctx->key[1].key[0] = get_unaligned_le64(backup_key); ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8); ctx->num = 2; } else { ctx->num = 1; } if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; octx = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(ctx))); } else { octx = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, RCU_INITIALIZER(ctx))); } if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); out: return err; } int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key) { struct tcp_fastopen_context *ctx; int n_keys = 0, i; rcu_read_lock(); if (icsk) ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx); else ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctx) { n_keys = tcp_fastopen_context_len(ctx); for (i = 0; i < n_keys; i++) { put_unaligned_le64(ctx->key[i].key[0], key + (i * 2)); put_unaligned_le64(ctx->key[i].key[1], key + (i * 2) + 1); } } rcu_read_unlock(); return n_keys; } static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req, struct sk_buff *syn, const siphash_key_t *key, struct tcp_fastopen_cookie *foc) { BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64)); if (req->rsk_ops->family == AF_INET) { const struct iphdr *iph = ip_hdr(syn); foc->val[0] = cpu_to_le64(siphash(&iph->saddr, sizeof(iph->saddr) + sizeof(iph->daddr), key)); foc->len = TCP_FASTOPEN_COOKIE_SIZE; return true; } #if IS_ENABLED(CONFIG_IPV6) if (req->rsk_ops->family == AF_INET6) { const struct ipv6hdr *ip6h = ipv6_hdr(syn); foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr, sizeof(ip6h->saddr) + sizeof(ip6h->daddr), key)); foc->len = TCP_FASTOPEN_COOKIE_SIZE; return true; } #endif return false; } /* Generate the fastopen cookie by applying SipHash to both the source and * destination addresses. */ static void tcp_fastopen_cookie_gen(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *foc) { struct tcp_fastopen_context *ctx; rcu_read_lock(); ctx = tcp_fastopen_get_ctx(sk); if (ctx) __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc); rcu_read_unlock(); } /* If an incoming SYN or SYNACK frame contains a payload and/or FIN, * queue this additional data / FIN. */ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; tcp_cleanup_skb(skb); /* segs_in has been initialized to 1 in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects * skb->len to include the tcp_hdrlen. Hence, it should * be called before __skb_pull(). */ tp->segs_in = 0; tcp_segs_in(tp, skb); __skb_pull(skb, tcp_hdrlen(skb)); sk_forced_mem_schedule(sk, skb->truesize); skb_set_owner_r(skb, sk); TCP_SKB_CB(skb)->seq++; TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; tcp_add_receive_queue(sk, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, * as we certainly are not changing upper 32bit value (0) */ tp->bytes_received = skb->len; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); } /* returns 0 - no key match, 1 for primary, 2 for backup */ static int tcp_fastopen_cookie_gen_check(struct sock *sk, struct request_sock *req, struct sk_buff *syn, struct tcp_fastopen_cookie *orig, struct tcp_fastopen_cookie *valid_foc) { struct tcp_fastopen_cookie search_foc = { .len = -1 }; struct tcp_fastopen_cookie *foc = valid_foc; struct tcp_fastopen_context *ctx; int i, ret = 0; rcu_read_lock(); ctx = tcp_fastopen_get_ctx(sk); if (!ctx) goto out; for (i = 0; i < tcp_fastopen_context_len(ctx); i++) { __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc); if (tcp_fastopen_cookie_match(foc, orig)) { ret = i + 1; goto out; } foc = &search_foc; } out: rcu_read_unlock(); return ret; } static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; bool own_req; child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, NULL, &own_req); if (!child) return NULL; spin_lock(&queue->fastopenq.lock); queue->fastopenq.qlen++; spin_unlock(&queue->fastopenq.lock); /* Initialize the child socket. Have to fix some values to take * into account the child is a Fast Open socket and is created * only out of the bits carried in the SYN packet. */ tp = tcp_sk(child); rcu_assign_pointer(tp->fastopen_rsk, req); tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. */ tp->snd_wnd = ntohs(tcp_hdr(skb)->window); tp->max_window = tp->snd_wnd; /* Activate the retrans timer so that SYNACK can be retransmitted. * The request socket is not added to the ehash * because it's been added to the accept queue directly. */ req->timeout = tcp_timeout_init(child); tcp_reset_xmit_timer(child, ICSK_TIME_RETRANS, req->timeout, false); refcount_set(&req->rsk_refcnt, 2); /* Now finish processing the fastopen child socket. */ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_fastopen_add_skb(child, skb); tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ return child; } static bool tcp_fastopen_queue_check(struct sock *sk) { struct fastopen_queue *fastopenq; int max_qlen; /* Make sure the listener has enabled fastopen, and we don't * exceed the max # of pending TFO requests allowed before trying * to validating the cookie in order to avoid burning CPU cycles * unnecessarily. * * XXX (TFO) - The implication of checking the max_qlen before * processing a cookie request is that clients can't differentiate * between qlen overflow causing Fast Open to be disabled * temporarily vs a server not supporting Fast Open at all. */ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; max_qlen = READ_ONCE(fastopenq->max_qlen); if (max_qlen == 0) return false; if (fastopenq->qlen >= max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); spin_unlock(&fastopenq->lock); return false; } fastopenq->rskq_rst_head = req1->dl_next; fastopenq->qlen--; spin_unlock(&fastopenq->lock); reqsk_put(req1); } return true; } static bool tcp_fastopen_no_cookie(const struct sock *sk, const struct dst_entry *dst, int flag) { return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) || tcp_sk(sk)->fastopen_no_cookie || (dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE)); } /* Returns true if we should perform Fast Open on the SYN. The cookie (foc) * may be updated and return the client in the SYN-ACK later. E.g., Fast Open * cookie request (foc->len == 0). */ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst) { bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1; int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); struct tcp_fastopen_cookie valid_foc = { .len = -1 }; struct sock *child; int ret = 0; if (foc->len == 0) /* Client requests a cookie */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); if (!((tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && tcp_fastopen_queue_check(sk))) { foc->len = -1; return NULL; } if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len == 0) { /* Client requests a cookie. */ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc); } else if (foc->len > 0) { ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc, &valid_foc); if (!ret) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else { /* Cookie is valid. Create a (full) child socket to * accept the data in SYN before returning a SYN-ACK to * ack the data. If we fail to create the socket, fall * back and ack the ISN only but includes the same * cookie. * * Note: Data-less SYN with valid cookie is allowed to * send data in SYN_RECV state. */ fastopen: child = tcp_fastopen_create_child(sk, skb, req); if (child) { if (ret == 2) { valid_foc.exp = foc->exp; *foc = valid_foc; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEALTKEY); } else { foc->len = -1; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); tcp_sk(child)->syn_fastopen_child = 1; return child; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } } valid_foc.exp = foc->exp; *foc = valid_foc; return NULL; } bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { const struct dst_entry *dst; tcp_fastopen_cache_get(sk, mss, cookie); /* Firewall blackhole issue check */ if (tcp_fastopen_active_should_disable(sk)) { cookie->len = -1; return false; } dst = __sk_dst_get(sk); if (tcp_fastopen_no_cookie(sk, dst, TFO_CLIENT_NO_COOKIE)) { cookie->len = -1; return true; } if (cookie->len > 0) return true; tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE; return false; } /* This function checks if we want to defer sending SYN until the first * write(). We defer under the following conditions: * 1. fastopen_connect sockopt is set * 2. we have a valid cookie * Return value: return true if we want to defer until application writes data * return false if we want to send out SYN immediately */ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) { struct tcp_fastopen_cookie cookie = { .len = 0 }; struct tcp_sock *tp = tcp_sk(sk); u16 mss; if (tp->fastopen_connect && !tp->fastopen_req) { if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) { inet_set_bit(DEFER_CONNECT, sk); return true; } /* Alloc fastopen_req in order for FO option to be included * in SYN */ tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req), sk->sk_allocation); if (tp->fastopen_req) tp->fastopen_req->cookie = cookie; else *err = -ENOBUFS; } return false; } EXPORT_IPV6_MOD(tcp_fastopen_defer_connect); /* * The following code block is to deal with middle box issues with TFO: * Middlebox firewall issues can potentially cause server's data being * blackholed after a successful 3WHS using TFO. * The proposed solution is to disable active TFO globally under the * following circumstances: * 1. client side TFO socket receives out of order FIN * 2. client side TFO socket receives out of order RST * 3. client side TFO socket has timed out three times consecutively during * or after handshake * We disable active side TFO globally for 1hr at first. Then if it * happens again, we disable it for 2h, then 4h, 8h, ... * And we reset the timeout back to 1hr when we see a successful active * TFO connection with data exchanges. */ /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ void tcp_fastopen_active_disable(struct sock *sk) { struct net *net = sock_net(sk); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout)) return; /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */ WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies); /* Paired with smp_rmb() in tcp_fastopen_active_should_disable(). * We want net->ipv4.tfo_active_disable_stamp to be updated first. */ smp_mb__before_atomic(); atomic_inc(&net->ipv4.tfo_active_disable_times); NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Calculate timeout for tfo active disable * Return true if we are still in the active TFO disable period * Return false if timeout already expired and we should use active TFO */ bool tcp_fastopen_active_should_disable(struct sock *sk) { unsigned int tfo_bh_timeout = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout); unsigned long timeout; int tfo_da_times; int multiplier; if (!tfo_bh_timeout) return false; tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times); if (!tfo_da_times) return false; /* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */ smp_rmb(); /* Limit timeout to max: 2^6 * initial timeout */ multiplier = 1 << min(tfo_da_times - 1, 6); /* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */ timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) + multiplier * tfo_bh_timeout * HZ; if (time_before(jiffies, timeout)) return true; /* Mark check bit so we can check for successful active TFO * condition and reset tfo_active_disable_times */ tcp_sk(sk)->syn_fastopen_ch = 1; return false; } /* Disable active TFO if FIN is the only packet in the ofo queue * and no data is received. * Also check if we can reset tfo_active_disable_times if data is * received successfully on a marked active TFO sockets opened on * a non-loopback interface */ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst; struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { skb = skb_rb_first(&tp->out_of_order_queue); if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; } } } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } } void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired) { u32 timeouts = inet_csk(sk)->icsk_retransmits; struct tcp_sock *tp = tcp_sk(sk); /* Broken middle-boxes may black-hole Fast Open connection during or * even after the handshake. Be extremely conservative and pause * Fast Open globally after hitting the third consecutive timeout or * exceeding the configured timeout limit. */ if ((tp->syn_fastopen || tp->syn_data || tp->syn_data_acked) && (timeouts == 2 || (timeouts < 2 && expired))) { tcp_fastopen_active_disable(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } }
5 4 14 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_SCHED_RED_H #define __NET_SCHED_RED_H #include <linux/types.h> #include <linux/bug.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #include <net/dsfield.h> #include <linux/reciprocal_div.h> /* Random Early Detection (RED) algorithm. ======================================= Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. This file codes a "divisionless" version of RED algorithm as written down in Fig.17 of the paper. Short description. ------------------ When a new packet arrives we calculate the average queue length: avg = (1-W)*avg + W*current_queue_len, W is the filter time constant (chosen as 2^(-Wlog)), it controls the inertia of the algorithm. To allow larger bursts, W should be decreased. if (avg > th_max) -> packet marked (dropped). if (avg < th_min) -> packet passes. if (th_min < avg < th_max) we calculate probability: Pb = max_P * (avg - th_min)/(th_max-th_min) and mark (drop) packet with this probability. Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). max_P should be small (not 1), usually 0.01..0.02 is good value. max_P is chosen as a number, so that max_P/(th_max-th_min) is a negative power of two in order arithmetic to contain only shifts. Parameters, settable by user: ----------------------------- qth_min - bytes (should be < qth_max/2) qth_max - bytes (should be at least 2*qth_min and less limit) Wlog - bits (<32) log(1/W). Plog - bits (<32) Plog is related to max_P by formula: max_P = (qth_max-qth_min)/2^Plog; F.e. if qth_max=128K and qth_min=32K, then Plog=22 corresponds to max_P=0.02 Scell_log Stab Lookup table for log((1-W)^(t/t_ave). NOTES: Upper bound on W. ----------------- If you want to allow bursts of L packets of size S, you should choose W: L + 1 - th_min/S < (1-(1-W)^L)/W th_min/S = 32 th_min/S = 4 log(W) L -1 33 -2 35 -3 39 -4 46 -5 57 -6 75 -7 101 -8 135 -9 190 etc. */ /* * Adaptative RED : An Algorithm for Increasing the Robustness of RED's AQM * (Sally FLoyd, Ramakrishna Gummadi, and Scott Shenker) August 2001 * * Every 500 ms: * if (avg > target and max_p <= 0.5) * increase max_p : max_p += alpha; * else if (avg < target and max_p >= 0.01) * decrease max_p : max_p *= beta; * * target :[qth_min + 0.4*(qth_min - qth_max), * qth_min + 0.6*(qth_min - qth_max)]. * alpha : min(0.01, max_p / 4) * beta : 0.9 * max_P is a Q0.32 fixed point number (with 32 bits mantissa) * max_P between 0.01 and 0.5 (1% - 50%) [ Its no longer a negative power of two ] */ #define RED_ONE_PERCENT ((u32)DIV_ROUND_CLOSEST(1ULL<<32, 100)) #define MAX_P_MIN (1 * RED_ONE_PERCENT) #define MAX_P_MAX (50 * RED_ONE_PERCENT) #define MAX_P_ALPHA(val) min(MAX_P_MIN, val / 4) #define RED_STAB_SIZE 256 #define RED_STAB_MASK (RED_STAB_SIZE - 1) struct red_stats { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ u32 forced_drop; /* Forced drops, qavg > max_thresh */ u32 forced_mark; /* Forced marks, qavg > max_thresh */ u32 pdrop; /* Drops due to queue limits */ }; struct red_parms { /* Parameters */ u32 qth_min; /* Min avg length threshold: Wlog scaled */ u32 qth_max; /* Max avg length threshold: Wlog scaled */ u32 Scell_max; u32 max_P; /* probability, [0 .. 1.0] 32 scaled */ /* reciprocal_value(max_P / qth_delta) */ struct reciprocal_value max_P_reciprocal; u32 qth_delta; /* max_th - min_th */ u32 target_min; /* min_th + 0.4*(max_th - min_th) */ u32 target_max; /* min_th + 0.6*(max_th - min_th) */ u8 Scell_log; u8 Wlog; /* log(W) */ u8 Plog; /* random number bits */ u8 Stab[RED_STAB_SIZE]; }; struct red_vars { /* Variables */ int qcount; /* Number of packets since last random number generation */ u32 qR; /* Cached random number */ unsigned long qavg; /* Average queue length: Wlog scaled */ ktime_t qidlestart; /* Start of current idle period */ }; static inline u32 red_maxp(u8 Plog) { return Plog < 32 ? (~0U >> Plog) : ~0U; } static inline void red_set_vars(struct red_vars *v) { /* Reset average queue length, the value is strictly bound * to the parameters below, resetting hurts a bit but leaving * it might result in an unreasonable qavg for a while. --TGR */ v->qavg = 0; v->qcount = -1; } static inline bool red_check_params(u32 qth_min, u32 qth_max, u8 Wlog, u8 Scell_log, u8 *stab) { if (fls(qth_min) + Wlog >= 32) return false; if (fls(qth_max) + Wlog >= 32) return false; if (Scell_log >= 32) return false; if (qth_max < qth_min) return false; if (stab) { int i; for (i = 0; i < RED_STAB_SIZE; i++) if (stab[i] >= 32) return false; } return true; } static inline int red_get_flags(unsigned char qopt_flags, unsigned char historic_mask, struct nlattr *flags_attr, unsigned char supported_mask, struct nla_bitfield32 *p_flags, unsigned char *p_userbits, struct netlink_ext_ack *extack) { struct nla_bitfield32 flags; if (qopt_flags && flags_attr) { NL_SET_ERR_MSG_MOD(extack, "flags should be passed either through qopt, or through a dedicated attribute"); return -EINVAL; } if (flags_attr) { flags = nla_get_bitfield32(flags_attr); } else { flags.selector = historic_mask; flags.value = qopt_flags & historic_mask; } *p_flags = flags; *p_userbits = qopt_flags & ~historic_mask; return 0; } static inline int red_validate_flags(unsigned char flags, struct netlink_ext_ack *extack) { if ((flags & TC_RED_NODROP) && !(flags & TC_RED_ECN)) { NL_SET_ERR_MSG_MOD(extack, "nodrop mode is only meaningful with ECN"); return -EINVAL; } return 0; } static inline void red_set_parms(struct red_parms *p, u32 qth_min, u32 qth_max, u8 Wlog, u8 Plog, u8 Scell_log, u8 *stab, u32 max_P) { int delta = qth_max - qth_min; u32 max_p_delta; WRITE_ONCE(p->qth_min, qth_min << Wlog); WRITE_ONCE(p->qth_max, qth_max << Wlog); WRITE_ONCE(p->Wlog, Wlog); WRITE_ONCE(p->Plog, Plog); if (delta <= 0) delta = 1; p->qth_delta = delta; if (!max_P) { max_P = red_maxp(Plog); max_P *= delta; /* max_P = (qth_max - qth_min)/2^Plog */ } WRITE_ONCE(p->max_P, max_P); max_p_delta = max_P / delta; max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); /* RED Adaptative target : * [min_th + 0.4*(min_th - max_th), * min_th + 0.6*(min_th - max_th)]. */ delta /= 5; p->target_min = qth_min + 2*delta; p->target_max = qth_min + 3*delta; WRITE_ONCE(p->Scell_log, Scell_log); p->Scell_max = (255 << Scell_log); if (stab) memcpy(p->Stab, stab, sizeof(p->Stab)); } static inline int red_is_idling(const struct red_vars *v) { return v->qidlestart != 0; } static inline void red_start_of_idle_period(struct red_vars *v) { v->qidlestart = ktime_get(); } static inline void red_end_of_idle_period(struct red_vars *v) { v->qidlestart = 0; } static inline void red_restart(struct red_vars *v) { red_end_of_idle_period(v); v->qavg = 0; v->qcount = -1; } static inline unsigned long red_calc_qavg_from_idle_time(const struct red_parms *p, const struct red_vars *v) { s64 delta = ktime_us_delta(ktime_get(), v->qidlestart); long us_idle = min_t(s64, delta, p->Scell_max); int shift; /* * The problem: ideally, average length queue recalculation should * be done over constant clock intervals. This is too expensive, so * that the calculation is driven by outgoing packets. * When the queue is idle we have to model this clock by hand. * * SF+VJ proposed to "generate": * * m = idletime / (average_pkt_size / bandwidth) * * dummy packets as a burst after idle time, i.e. * * v->qavg *= (1-W)^m * * This is an apparently overcomplicated solution (f.e. we have to * precompute a table to make this calculation in reasonable time) * I believe that a simpler model may be used here, * but it is field for experiments. */ shift = p->Stab[(us_idle >> p->Scell_log) & RED_STAB_MASK]; if (shift) return v->qavg >> shift; else { /* Approximate initial part of exponent with linear function: * * (1-W)^m ~= 1-mW + ... * * Seems, it is the best solution to * problem of too coarse exponent tabulation. */ us_idle = (v->qavg * (u64)us_idle) >> p->Scell_log; if (us_idle < (v->qavg >> 1)) return v->qavg - us_idle; else return v->qavg >> 1; } } static inline unsigned long red_calc_qavg_no_idle_time(const struct red_parms *p, const struct red_vars *v, unsigned int backlog) { /* * NOTE: v->qavg is fixed point number with point at Wlog. * The formula below is equivalent to floating point * version: * * qavg = qavg*(1-W) + backlog*W; * * --ANK (980924) */ return v->qavg + (backlog - (v->qavg >> p->Wlog)); } static inline unsigned long red_calc_qavg(const struct red_parms *p, const struct red_vars *v, unsigned int backlog) { if (!red_is_idling(v)) return red_calc_qavg_no_idle_time(p, v, backlog); else return red_calc_qavg_from_idle_time(p, v); } static inline u32 red_random(const struct red_parms *p) { return reciprocal_divide(get_random_u32(), p->max_P_reciprocal); } static inline int red_mark_probability(const struct red_parms *p, const struct red_vars *v, unsigned long qavg) { /* The formula used below causes questions. OK. qR is random number in the interval (0..1/max_P)*(qth_max-qth_min) i.e. 0..(2^Plog). If we used floating point arithmetic, it would be: (2^Plog)*rnd_num, where rnd_num is less 1. Taking into account, that qavg have fixed point at Wlog, two lines below have the following floating point equivalent: max_P*(qavg - qth_min)/(qth_max-qth_min) < rnd/qcount Any questions? --ANK (980924) */ return !(((qavg - p->qth_min) >> p->Wlog) * v->qcount < v->qR); } enum { RED_BELOW_MIN_THRESH, RED_BETWEEN_TRESH, RED_ABOVE_MAX_TRESH, }; static inline int red_cmp_thresh(const struct red_parms *p, unsigned long qavg) { if (qavg < p->qth_min) return RED_BELOW_MIN_THRESH; else if (qavg >= p->qth_max) return RED_ABOVE_MAX_TRESH; else return RED_BETWEEN_TRESH; } enum { RED_DONT_MARK, RED_PROB_MARK, RED_HARD_MARK, }; static inline int red_action(const struct red_parms *p, struct red_vars *v, unsigned long qavg) { switch (red_cmp_thresh(p, qavg)) { case RED_BELOW_MIN_THRESH: v->qcount = -1; return RED_DONT_MARK; case RED_BETWEEN_TRESH: if (++v->qcount) { if (red_mark_probability(p, v, qavg)) { v->qcount = 0; v->qR = red_random(p); return RED_PROB_MARK; } } else v->qR = red_random(p); return RED_DONT_MARK; case RED_ABOVE_MAX_TRESH: v->qcount = -1; return RED_HARD_MARK; } BUG(); return RED_DONT_MARK; } static inline void red_adaptative_algo(struct red_parms *p, struct red_vars *v) { unsigned long qavg; u32 max_p_delta; qavg = v->qavg; if (red_is_idling(v)) qavg = red_calc_qavg_from_idle_time(p, v); /* v->qavg is fixed point number with point at Wlog */ qavg >>= p->Wlog; if (qavg > p->target_max && p->max_P <= MAX_P_MAX) p->max_P += MAX_P_ALPHA(p->max_P); /* maxp = maxp + alpha */ else if (qavg < p->target_min && p->max_P >= MAX_P_MIN) p->max_P = (p->max_P/10)*9; /* maxp = maxp * Beta */ max_p_delta = DIV_ROUND_CLOSEST(p->max_P, p->qth_delta); max_p_delta = max(max_p_delta, 1U); p->max_P_reciprocal = reciprocal_value(max_p_delta); } #endif
9 6 3 2 3 2 1 9 5 2 2 2 3 2 1 2 9 5 5 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 // SPDX-License-Identifier: GPL-2.0-or-later /* * The AEGIS-128 Authenticated-Encryption Algorithm * Glue for AES-NI + SSE4.1 implementation * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. */ #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> #include <asm/fpu/api.h> #include <asm/cpu_device_id.h> #define AEGIS128_BLOCK_ALIGN 16 #define AEGIS128_BLOCK_SIZE 16 #define AEGIS128_NONCE_SIZE 16 #define AEGIS128_STATE_BLOCKS 5 #define AEGIS128_KEY_SIZE 16 #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 struct aegis_block { u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); }; struct aegis_state { struct aegis_block blocks[AEGIS128_STATE_BLOCKS]; }; struct aegis_ctx { struct aegis_block key; }; asmlinkage void aegis128_aesni_init(struct aegis_state *state, const struct aegis_block *key, const u8 iv[AEGIS128_NONCE_SIZE]); asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, unsigned int len); asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst, unsigned int len); asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, unsigned int len); asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, u8 *dst, unsigned int len); asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, u8 *dst, unsigned int len); asmlinkage void aegis128_aesni_final(struct aegis_state *state, struct aegis_block *tag_xor, unsigned int assoclen, unsigned int cryptlen); static void crypto_aegis128_aesni_process_ad( struct aegis_state *state, struct scatterlist *sg_src, unsigned int assoclen) { struct scatter_walk walk; struct aegis_block buf; unsigned int pos = 0; scatterwalk_start(&walk, sg_src); while (assoclen != 0) { unsigned int size = scatterwalk_next(&walk, assoclen); const u8 *src = walk.addr; unsigned int left = size; if (pos + size >= AEGIS128_BLOCK_SIZE) { if (pos > 0) { unsigned int fill = AEGIS128_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); pos = 0; left -= fill; src += fill; } aegis128_aesni_ad(state, src, left & ~(AEGIS128_BLOCK_SIZE - 1)); src += left & ~(AEGIS128_BLOCK_SIZE - 1); left &= AEGIS128_BLOCK_SIZE - 1; } memcpy(buf.bytes + pos, src, left); pos += left; assoclen -= size; scatterwalk_done_src(&walk, size); } if (pos > 0) { memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); } } static __always_inline void crypto_aegis128_aesni_process_crypt(struct aegis_state *state, struct skcipher_walk *walk, bool enc) { while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { if (enc) aegis128_aesni_enc(state, walk->src.virt.addr, walk->dst.virt.addr, round_down(walk->nbytes, AEGIS128_BLOCK_SIZE)); else aegis128_aesni_dec(state, walk->src.virt.addr, walk->dst.virt.addr, round_down(walk->nbytes, AEGIS128_BLOCK_SIZE)); skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); } if (walk->nbytes) { if (enc) aegis128_aesni_enc_tail(state, walk->src.virt.addr, walk->dst.virt.addr, walk->nbytes); else aegis128_aesni_dec_tail(state, walk->src.virt.addr, walk->dst.virt.addr, walk->nbytes); skcipher_walk_done(walk, 0); } } static struct aegis_ctx *crypto_aegis128_aesni_ctx(struct crypto_aead *aead) { u8 *ctx = crypto_aead_ctx(aead); ctx = PTR_ALIGN(ctx, __alignof__(struct aegis_ctx)); return (void *)ctx; } static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead); if (keylen != AEGIS128_KEY_SIZE) return -EINVAL; memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); return 0; } static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { if (authsize > AEGIS128_MAX_AUTH_SIZE) return -EINVAL; if (authsize < AEGIS128_MIN_AUTH_SIZE) return -EINVAL; return 0; } static __always_inline void crypto_aegis128_aesni_crypt(struct aead_request *req, struct aegis_block *tag_xor, unsigned int cryptlen, bool enc) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; if (enc) skcipher_walk_aead_encrypt(&walk, req, true); else skcipher_walk_aead_decrypt(&walk, req, true); kernel_fpu_begin(); aegis128_aesni_init(&state, &ctx->key, req->iv); crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); crypto_aegis128_aesni_process_crypt(&state, &walk, enc); aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); kernel_fpu_end(); } static int crypto_aegis128_aesni_encrypt(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen; crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); return 0; } static int crypto_aegis128_aesni_decrypt(struct aead_request *req) { static const struct aegis_block zeros = {}; struct crypto_aead *tfm = crypto_aead_reqtfm(req); struct aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen - authsize; scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); return crypto_memneq(tag.bytes, zeros.bytes, authsize) ? -EBADMSG : 0; } static struct aead_alg crypto_aegis128_aesni_alg = { .setkey = crypto_aegis128_aesni_setkey, .setauthsize = crypto_aegis128_aesni_setauthsize, .encrypt = crypto_aegis128_aesni_encrypt, .decrypt = crypto_aegis128_aesni_decrypt, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, .chunksize = AEGIS128_BLOCK_SIZE, .base = { .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aegis_ctx) + __alignof__(struct aegis_ctx), .cra_priority = 400, .cra_name = "__aegis128", .cra_driver_name = "__aegis128-aesni", .cra_module = THIS_MODULE, } }; static struct simd_aead_alg *simd_alg; static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM4_1) || !boot_cpu_has(X86_FEATURE_AES) || !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1, &simd_alg); } static void __exit crypto_aegis128_aesni_module_exit(void) { simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis128_aesni_module_init); module_exit(crypto_aegis128_aesni_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation"); MODULE_ALIAS_CRYPTO("aegis128"); MODULE_ALIAS_CRYPTO("aegis128-aesni");
4 4 4 4 6 6 3 5 1 6 3 1 5 9 8 4 4 4 3 3 4 4 4 4 4 4 4 4 10 6 3 5 5 6 6 6 4 4 4 1 4 3 4 4 4 4 5 5 1 4 5 6 6 6 1 10 10 8 3 3 8 4 10 53 11 8 53 53 53 53 1 1 4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright(c) 1999 - 2004 Intel Corporation. All rights reserved. */ #include <linux/skbuff.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/if_bonding.h> #include <linux/pkt_sched.h> #include <net/net_namespace.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/netlink.h> /* General definitions */ #define AD_SHORT_TIMEOUT 1 #define AD_LONG_TIMEOUT 0 #define AD_STANDBY 0x2 #define AD_MAX_TX_IN_SECOND 3 #define AD_COLLECTOR_MAX_DELAY 0 /* Timer definitions (43.4.4 in the 802.3ad standard) */ #define AD_FAST_PERIODIC_TIME 1 #define AD_SLOW_PERIODIC_TIME 30 #define AD_SHORT_TIMEOUT_TIME (3*AD_FAST_PERIODIC_TIME) #define AD_LONG_TIMEOUT_TIME (3*AD_SLOW_PERIODIC_TIME) #define AD_CHURN_DETECTION_TIME 60 #define AD_AGGREGATE_WAIT_TIME 2 /* Port Variables definitions used by the State Machines (43.4.7 in the * 802.3ad standard) */ #define AD_PORT_BEGIN 0x1 #define AD_PORT_LACP_ENABLED 0x2 #define AD_PORT_ACTOR_CHURN 0x4 #define AD_PORT_PARTNER_CHURN 0x8 #define AD_PORT_READY 0x10 #define AD_PORT_READY_N 0x20 #define AD_PORT_MATCHED 0x40 #define AD_PORT_STANDBY 0x80 #define AD_PORT_SELECTED 0x100 #define AD_PORT_MOVED 0x200 #define AD_PORT_CHURNED (AD_PORT_ACTOR_CHURN | AD_PORT_PARTNER_CHURN) /* Port Key definitions * key is determined according to the link speed, duplex and * user key (which is yet not supported) * -------------------------------------------------------------- * Port key | User key (10 bits) | Speed (5 bits) | Duplex| * -------------------------------------------------------------- * |15 6|5 1|0 */ #define AD_DUPLEX_KEY_MASKS 0x1 #define AD_SPEED_KEY_MASKS 0x3E #define AD_USER_KEY_MASKS 0xFFC0 enum ad_link_speed_type { AD_LINK_SPEED_1MBPS = 1, AD_LINK_SPEED_10MBPS, AD_LINK_SPEED_100MBPS, AD_LINK_SPEED_1000MBPS, AD_LINK_SPEED_2500MBPS, AD_LINK_SPEED_5000MBPS, AD_LINK_SPEED_10000MBPS, AD_LINK_SPEED_14000MBPS, AD_LINK_SPEED_20000MBPS, AD_LINK_SPEED_25000MBPS, AD_LINK_SPEED_40000MBPS, AD_LINK_SPEED_50000MBPS, AD_LINK_SPEED_56000MBPS, AD_LINK_SPEED_100000MBPS, AD_LINK_SPEED_200000MBPS, AD_LINK_SPEED_400000MBPS, AD_LINK_SPEED_800000MBPS, }; /* compare MAC addresses */ #define MAC_ADDRESS_EQUAL(A, B) \ ether_addr_equal_64bits((const u8 *)A, (const u8 *)B) static const u16 ad_ticks_per_sec = 1000 / AD_TIMER_INTERVAL; static const int ad_delta_in_ticks = (AD_TIMER_INTERVAL * HZ) / 1000; const u8 lacpdu_mcast_addr[ETH_ALEN + 2] __long_aligned = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x02 }; /* ================= main 802.3ad protocol functions ================== */ static int ad_lacpdu_send(struct port *port); static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); static void ad_periodic_machine(struct port *port, struct bond_params *bond_params); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); static void ad_clear_agg(struct aggregator *aggregator); static void ad_initialize_agg(struct aggregator *aggregator); static void ad_initialize_port(struct port *port, int lacp_fast); static void ad_enable_collecting(struct port *port); static void ad_disable_distributing(struct port *port, bool *update_slave_arr); static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr); static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port); static void ad_marker_response_received(struct bond_marker *marker, struct port *port); static void ad_update_actor_keys(struct port *port, bool reset); /* ================= api to bonding and kernel code ================== */ /** * __get_bond_by_port - get the port's bonding struct * @port: the port we're looking at * * Return @port's bonding struct, or %NULL if it can't be found. */ static inline struct bonding *__get_bond_by_port(struct port *port) { if (port->slave == NULL) return NULL; return bond_get_bond_by_slave(port->slave); } /** * __get_first_agg - get the first aggregator in the bond * @port: the port we're looking at * * Return the aggregator of the first slave in @bond, or %NULL if it can't be * found. * The caller must hold RCU or RTNL lock. */ static inline struct aggregator *__get_first_agg(struct port *port) { struct bonding *bond = __get_bond_by_port(port); struct slave *first_slave; struct aggregator *agg; /* If there's no bond for this port, or bond has no slaves */ if (bond == NULL) return NULL; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); agg = first_slave ? &(SLAVE_AD_INFO(first_slave)->aggregator) : NULL; rcu_read_unlock(); return agg; } /** * __agg_has_partner - see if we have a partner * @agg: the agregator we're looking at * * Return nonzero if aggregator has a partner (denoted by a non-zero ether * address for the partner). Return 0 if not. */ static inline int __agg_has_partner(struct aggregator *agg) { return !is_zero_ether_addr(agg->partner_system.mac_addr_value); } /** * __disable_distributing_port - disable the port's slave for distributing. * Port will still be able to collect. * @port: the port we're looking at * * This will disable only distributing on the port's slave. */ static void __disable_distributing_port(struct port *port) { bond_set_slave_tx_disabled_flags(port->slave, BOND_SLAVE_NOTIFY_LATER); } /** * __enable_collecting_port - enable the port's slave for collecting, * if it's up * @port: the port we're looking at * * This will enable only collecting on the port's slave. */ static void __enable_collecting_port(struct port *port) { struct slave *slave = port->slave; if (slave->link == BOND_LINK_UP && bond_slave_is_up(slave)) bond_set_slave_rx_enabled_flags(slave, BOND_SLAVE_NOTIFY_LATER); } /** * __disable_port - disable the port's slave * @port: the port we're looking at * * This will disable both collecting and distributing on the port's slave. */ static inline void __disable_port(struct port *port) { bond_set_slave_inactive_flags(port->slave, BOND_SLAVE_NOTIFY_LATER); } /** * __enable_port - enable the port's slave, if it's up * @port: the port we're looking at * * This will enable both collecting and distributing on the port's slave. */ static inline void __enable_port(struct port *port) { struct slave *slave = port->slave; if ((slave->link == BOND_LINK_UP) && bond_slave_is_up(slave)) bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_LATER); } /** * __port_move_to_attached_state - check if port should transition back to attached * state. * @port: the port we're looking at */ static bool __port_move_to_attached_state(struct port *port) { if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY) || !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) || !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) port->sm_mux_state = AD_MUX_ATTACHED; return port->sm_mux_state == AD_MUX_ATTACHED; } /** * __port_is_collecting_distributing - check if the port's slave is in the * combined collecting/distributing state * @port: the port we're looking at */ static int __port_is_collecting_distributing(struct port *port) { return bond_is_active_slave(port->slave); } /** * __get_agg_selection_mode - get the aggregator selection mode * @port: the port we're looking at * * Get the aggregator selection mode. Can be %STABLE, %BANDWIDTH or %COUNT. */ static inline u32 __get_agg_selection_mode(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return BOND_AD_STABLE; return bond->params.ad_select; } /** * __check_agg_selection_timer - check if the selection timer has expired * @port: the port we're looking at */ static inline int __check_agg_selection_timer(struct port *port) { struct bonding *bond = __get_bond_by_port(port); if (bond == NULL) return 0; return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 1 : 0; } /** * __get_link_speed - get a port's speed * @port: the port we're looking at * * Return @port's speed in 802.3ad enum format. i.e. one of: * 0, * %AD_LINK_SPEED_10MBPS, * %AD_LINK_SPEED_100MBPS, * %AD_LINK_SPEED_1000MBPS, * %AD_LINK_SPEED_2500MBPS, * %AD_LINK_SPEED_5000MBPS, * %AD_LINK_SPEED_10000MBPS * %AD_LINK_SPEED_14000MBPS, * %AD_LINK_SPEED_20000MBPS * %AD_LINK_SPEED_25000MBPS * %AD_LINK_SPEED_40000MBPS * %AD_LINK_SPEED_50000MBPS * %AD_LINK_SPEED_56000MBPS * %AD_LINK_SPEED_100000MBPS * %AD_LINK_SPEED_200000MBPS * %AD_LINK_SPEED_400000MBPS * %AD_LINK_SPEED_800000MBPS */ static u16 __get_link_speed(struct port *port) { struct slave *slave = port->slave; u16 speed; /* this if covers only a special case: when the configuration starts * with link down, it sets the speed to 0. * This is done in spite of the fact that the e100 driver reports 0 * to be compatible with MVT in the future. */ if (slave->link != BOND_LINK_UP) speed = 0; else { switch (slave->speed) { case SPEED_10: speed = AD_LINK_SPEED_10MBPS; break; case SPEED_100: speed = AD_LINK_SPEED_100MBPS; break; case SPEED_1000: speed = AD_LINK_SPEED_1000MBPS; break; case SPEED_2500: speed = AD_LINK_SPEED_2500MBPS; break; case SPEED_5000: speed = AD_LINK_SPEED_5000MBPS; break; case SPEED_10000: speed = AD_LINK_SPEED_10000MBPS; break; case SPEED_14000: speed = AD_LINK_SPEED_14000MBPS; break; case SPEED_20000: speed = AD_LINK_SPEED_20000MBPS; break; case SPEED_25000: speed = AD_LINK_SPEED_25000MBPS; break; case SPEED_40000: speed = AD_LINK_SPEED_40000MBPS; break; case SPEED_50000: speed = AD_LINK_SPEED_50000MBPS; break; case SPEED_56000: speed = AD_LINK_SPEED_56000MBPS; break; case SPEED_100000: speed = AD_LINK_SPEED_100000MBPS; break; case SPEED_200000: speed = AD_LINK_SPEED_200000MBPS; break; case SPEED_400000: speed = AD_LINK_SPEED_400000MBPS; break; case SPEED_800000: speed = AD_LINK_SPEED_800000MBPS; break; default: /* unknown speed value from ethtool. shouldn't happen */ if (slave->speed != SPEED_UNKNOWN) pr_err_once("%s: (slave %s): unknown ethtool speed (%d) for port %d (set it to 0)\n", slave->bond->dev->name, slave->dev->name, slave->speed, port->actor_port_number); speed = 0; break; } } slave_dbg(slave->bond->dev, slave->dev, "Port %d Received link speed %d update from adapter\n", port->actor_port_number, speed); return speed; } /** * __get_duplex - get a port's duplex * @port: the port we're looking at * * Return @port's duplex in 802.3ad bitmask format. i.e.: * 0x01 if in full duplex * 0x00 otherwise */ static u8 __get_duplex(struct port *port) { struct slave *slave = port->slave; u8 retval = 0x0; /* handling a special case: when the configuration starts with * link down, it sets the duplex to 0. */ if (slave->link == BOND_LINK_UP) { switch (slave->duplex) { case DUPLEX_FULL: retval = 0x1; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status full duplex update from adapter\n", port->actor_port_number); break; case DUPLEX_HALF: default: retval = 0x0; slave_dbg(slave->bond->dev, slave->dev, "Port %d Received status NOT full duplex update from adapter\n", port->actor_port_number); break; } } return retval; } static void __ad_actor_update_port(struct port *port) { const struct bonding *bond = bond_get_bond_by_slave(port->slave); port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; } /* Conversions */ /** * __ad_timer_to_ticks - convert a given timer type to AD module ticks * @timer_type: which timer to operate * @par: timer parameter. see below * * If @timer_type is %current_while_timer, @par indicates long/short timer. * If @timer_type is %periodic_timer, @par is one of %FAST_PERIODIC_TIME, * %SLOW_PERIODIC_TIME. */ static u16 __ad_timer_to_ticks(u16 timer_type, u16 par) { u16 retval = 0; /* to silence the compiler */ switch (timer_type) { case AD_CURRENT_WHILE_TIMER: /* for rx machine usage */ if (par) retval = (AD_SHORT_TIMEOUT_TIME*ad_ticks_per_sec); else retval = (AD_LONG_TIMEOUT_TIME*ad_ticks_per_sec); break; case AD_ACTOR_CHURN_TIMER: /* for local churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_PERIODIC_TIMER: /* for periodic machine */ retval = (par*ad_ticks_per_sec); /* long timeout */ break; case AD_PARTNER_CHURN_TIMER: /* for remote churn machine */ retval = (AD_CHURN_DETECTION_TIME*ad_ticks_per_sec); break; case AD_WAIT_WHILE_TIMER: /* for selection machine */ retval = (AD_AGGREGATE_WAIT_TIME*ad_ticks_per_sec); break; } return retval; } /* ================= ad_rx_machine helper functions ================== */ /** * __choose_matched - update a port's matched variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the matched variable, using parameter values from a * newly received lacpdu. Parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the actor. Matched is set to TRUE if all of these parameters * match and the PDU parameter partner_state.aggregation has the same value as * actor_oper_port_state.aggregation and lacp will actively maintain the link * in the aggregation. Matched is also set to TRUE if the value of * actor_state.aggregation in the received PDU is set to FALSE, i.e., indicates * an individual link and lacp will actively maintain the link. Otherwise, * matched is set to FALSE. LACP is considered to be actively maintaining the * link if either the PDU's actor_state.lacp_activity variable is TRUE or both * the actor's actor_oper_port_state.lacp_activity and the PDU's * partner_state.lacp_activity variables are TRUE. * * Note: the AD_PORT_MATCHED "variable" is not specified by 802.3ad; it is * used here to implement the language from 802.3ad 43.4.9 that requires * recordPDU to "match" the LACPDU parameters to the stored values. */ static void __choose_matched(struct lacpdu *lacpdu, struct port *port) { /* check if all parameters are alike * or this is individual link(aggregation == FALSE) * then update the state machine Matched variable. */ if (((ntohs(lacpdu->partner_port) == port->actor_port_number) && (ntohs(lacpdu->partner_port_priority) == port->actor_port_priority) && MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) && (ntohs(lacpdu->partner_system_priority) == port->actor_system_priority) && (ntohs(lacpdu->partner_key) == port->actor_oper_port_key) && ((lacpdu->partner_state & LACP_STATE_AGGREGATION) == (port->actor_oper_port_state & LACP_STATE_AGGREGATION))) || ((lacpdu->actor_state & LACP_STATE_AGGREGATION) == 0) ) { port->sm_vars |= AD_PORT_MATCHED; } else { port->sm_vars &= ~AD_PORT_MATCHED; } } /** * __record_pdu - record parameters from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Record the parameter values for the Actor carried in a received lacpdu as * the current partner operational parameter values and sets * actor_oper_port_state.defaulted to FALSE. */ static void __record_pdu(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { struct port_params *partner = &port->partner_oper; __choose_matched(lacpdu, port); /* record the new parameter values for the partner * operational */ partner->port_number = ntohs(lacpdu->actor_port); partner->port_priority = ntohs(lacpdu->actor_port_priority); partner->system = lacpdu->actor_system; partner->system_priority = ntohs(lacpdu->actor_system_priority); partner->key = ntohs(lacpdu->actor_key); partner->port_state = lacpdu->actor_state; /* set actor_oper_port_state.defaulted to FALSE */ port->actor_oper_port_state &= ~LACP_STATE_DEFAULTED; /* set the partner sync. to on if the partner is sync, * and the port is matched */ if ((port->sm_vars & AD_PORT_MATCHED) && (lacpdu->actor_state & LACP_STATE_SYNCHRONIZATION)) { partner->port_state |= LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=1\n"); } else { partner->port_state &= ~LACP_STATE_SYNCHRONIZATION; slave_dbg(port->slave->bond->dev, port->slave->dev, "partner sync=0\n"); } } } /** * __record_default - record default parameters * @port: the port we're looking at * * This function records the default parameter values for the partner carried * in the Partner Admin parameters as the current partner operational parameter * values and sets actor_oper_port_state.defaulted to TRUE. */ static void __record_default(struct port *port) { if (port) { /* record the partner admin parameters */ memcpy(&port->partner_oper, &port->partner_admin, sizeof(struct port_params)); /* set actor_oper_port_state.defaulted to true */ port->actor_oper_port_state |= LACP_STATE_DEFAULTED; } } /** * __update_selected - update a port's Selected variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Update the value of the selected variable, using parameter values from a * newly received lacpdu. The parameter values for the Actor carried in the * received PDU are compared with the corresponding operational parameter * values for the ports partner. If one or more of the comparisons shows that * the value(s) received in the PDU differ from the current operational values, * then selected is set to FALSE and actor_oper_port_state.synchronization is * set to out_of_sync. Otherwise, selected remains unchanged. */ static void __update_selected(struct lacpdu *lacpdu, struct port *port) { if (lacpdu && port) { const struct port_params *partner = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (ntohs(lacpdu->actor_port) != partner->port_number || ntohs(lacpdu->actor_port_priority) != partner->port_priority || !MAC_ADDRESS_EQUAL(&lacpdu->actor_system, &partner->system) || ntohs(lacpdu->actor_system_priority) != partner->system_priority || ntohs(lacpdu->actor_key) != partner->key || (lacpdu->actor_state & LACP_STATE_AGGREGATION) != (partner->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_default_selected - update a port's Selected variable from Partner * @port: the port we're looking at * * This function updates the value of the selected variable, using the partner * administrative parameter values. The administrative values are compared with * the corresponding operational parameter values for the partner. If one or * more of the comparisons shows that the administrative value(s) differ from * the current operational values, then Selected is set to FALSE and * actor_oper_port_state.synchronization is set to OUT_OF_SYNC. Otherwise, * Selected remains unchanged. */ static void __update_default_selected(struct port *port) { if (port) { const struct port_params *admin = &port->partner_admin; const struct port_params *oper = &port->partner_oper; /* check if any parameter is different then * update the state machine selected variable. */ if (admin->port_number != oper->port_number || admin->port_priority != oper->port_priority || !MAC_ADDRESS_EQUAL(&admin->system, &oper->system) || admin->system_priority != oper->system_priority || admin->key != oper->key || (admin->port_state & LACP_STATE_AGGREGATION) != (oper->port_state & LACP_STATE_AGGREGATION)) { port->sm_vars &= ~AD_PORT_SELECTED; } } } /** * __update_ntt - update a port's ntt variable from a received lacpdu * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * Updates the value of the ntt variable, using parameter values from a newly * received lacpdu. The parameter values for the partner carried in the * received PDU are compared with the corresponding operational parameter * values for the Actor. If one or more of the comparisons shows that the * value(s) received in the PDU differ from the current operational values, * then ntt is set to TRUE. Otherwise, ntt remains unchanged. */ static void __update_ntt(struct lacpdu *lacpdu, struct port *port) { /* validate lacpdu and port */ if (lacpdu && port) { /* check if any parameter is different then * update the port->ntt. */ if ((ntohs(lacpdu->partner_port) != port->actor_port_number) || (ntohs(lacpdu->partner_port_priority) != port->actor_port_priority) || !MAC_ADDRESS_EQUAL(&(lacpdu->partner_system), &(port->actor_system)) || (ntohs(lacpdu->partner_system_priority) != port->actor_system_priority) || (ntohs(lacpdu->partner_key) != port->actor_oper_port_key) || ((lacpdu->partner_state & LACP_STATE_LACP_ACTIVITY) != (port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY)) || ((lacpdu->partner_state & LACP_STATE_LACP_TIMEOUT) != (port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)) || ((lacpdu->partner_state & LACP_STATE_SYNCHRONIZATION) != (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) || ((lacpdu->partner_state & LACP_STATE_AGGREGATION) != (port->actor_oper_port_state & LACP_STATE_AGGREGATION)) ) { port->ntt = true; } } } /** * __agg_ports_are_ready - check if all ports in an aggregator are ready * @aggregator: the aggregator we're looking at * */ static int __agg_ports_are_ready(struct aggregator *aggregator) { struct port *port; int retval = 1; if (aggregator) { /* scan all ports in this aggregator to verfy if they are * all ready. */ for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (!(port->sm_vars & AD_PORT_READY_N)) { retval = 0; break; } } } return retval; } /** * __set_agg_ports_ready - set value of Ready bit in all ports of an aggregator * @aggregator: the aggregator we're looking at * @val: Should the ports' ready bit be set on or off * */ static void __set_agg_ports_ready(struct aggregator *aggregator, int val) { struct port *port; for (port = aggregator->lag_ports; port; port = port->next_port_in_aggregator) { if (val) port->sm_vars |= AD_PORT_READY; else port->sm_vars &= ~AD_PORT_READY; } } static int __agg_active_ports(struct aggregator *agg) { struct port *port; int active = 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (port->is_enabled) active++; } return active; } /** * __get_agg_bandwidth - get the total bandwidth of an aggregator * @aggregator: the aggregator we're looking at * */ static u32 __get_agg_bandwidth(struct aggregator *aggregator) { int nports = __agg_active_ports(aggregator); u32 bandwidth = 0; if (nports) { switch (__get_link_speed(aggregator->lag_ports)) { case AD_LINK_SPEED_1MBPS: bandwidth = nports; break; case AD_LINK_SPEED_10MBPS: bandwidth = nports * 10; break; case AD_LINK_SPEED_100MBPS: bandwidth = nports * 100; break; case AD_LINK_SPEED_1000MBPS: bandwidth = nports * 1000; break; case AD_LINK_SPEED_2500MBPS: bandwidth = nports * 2500; break; case AD_LINK_SPEED_5000MBPS: bandwidth = nports * 5000; break; case AD_LINK_SPEED_10000MBPS: bandwidth = nports * 10000; break; case AD_LINK_SPEED_14000MBPS: bandwidth = nports * 14000; break; case AD_LINK_SPEED_20000MBPS: bandwidth = nports * 20000; break; case AD_LINK_SPEED_25000MBPS: bandwidth = nports * 25000; break; case AD_LINK_SPEED_40000MBPS: bandwidth = nports * 40000; break; case AD_LINK_SPEED_50000MBPS: bandwidth = nports * 50000; break; case AD_LINK_SPEED_56000MBPS: bandwidth = nports * 56000; break; case AD_LINK_SPEED_100000MBPS: bandwidth = nports * 100000; break; case AD_LINK_SPEED_200000MBPS: bandwidth = nports * 200000; break; case AD_LINK_SPEED_400000MBPS: bandwidth = nports * 400000; break; case AD_LINK_SPEED_800000MBPS: bandwidth = nports * 800000; break; default: bandwidth = 0; /* to silence the compiler */ } } return bandwidth; } /** * __get_active_agg - get the current active aggregator * @aggregator: the aggregator we're looking at * * Caller must hold RCU lock. */ static struct aggregator *__get_active_agg(struct aggregator *aggregator) { struct bonding *bond = aggregator->slave->bond; struct list_head *iter; struct slave *slave; bond_for_each_slave_rcu(bond, slave, iter) if (SLAVE_AD_INFO(slave)->aggregator.is_active) return &(SLAVE_AD_INFO(slave)->aggregator); return NULL; } /** * __update_lacpdu_from_port - update a port's lacpdu fields * @port: the port we're looking at */ static inline void __update_lacpdu_from_port(struct port *port) { struct lacpdu *lacpdu = &port->lacpdu; const struct port_params *partner = &port->partner_oper; /* update current actual Actor parameters * lacpdu->subtype initialized * lacpdu->version_number initialized * lacpdu->tlv_type_actor_info initialized * lacpdu->actor_information_length initialized */ lacpdu->actor_system_priority = htons(port->actor_system_priority); lacpdu->actor_system = port->actor_system; lacpdu->actor_key = htons(port->actor_oper_port_key); lacpdu->actor_port_priority = htons(port->actor_port_priority); lacpdu->actor_port = htons(port->actor_port_number); lacpdu->actor_state = port->actor_oper_port_state; slave_dbg(port->slave->bond->dev, port->slave->dev, "update lacpdu: actor port state %x\n", port->actor_oper_port_state); /* lacpdu->reserved_3_1 initialized * lacpdu->tlv_type_partner_info initialized * lacpdu->partner_information_length initialized */ lacpdu->partner_system_priority = htons(partner->system_priority); lacpdu->partner_system = partner->system; lacpdu->partner_key = htons(partner->key); lacpdu->partner_port_priority = htons(partner->port_priority); lacpdu->partner_port = htons(partner->port_number); lacpdu->partner_state = partner->port_state; /* lacpdu->reserved_3_2 initialized * lacpdu->tlv_type_collector_info initialized * lacpdu->collector_information_length initialized * collector_max_delay initialized * reserved_12[12] initialized * tlv_type_terminator initialized * terminator_length initialized * reserved_50[50] initialized */ } /* ================= main 802.3ad protocol code ========================= */ /** * ad_lacpdu_send - send out a lacpdu packet on a given port * @port: the port we're looking at * * Returns: 0 on success * < 0 on error */ static int ad_lacpdu_send(struct port *port) { struct slave *slave = port->slave; struct sk_buff *skb; struct lacpdu_header *lacpdu_header; int length = sizeof(struct lacpdu_header); skb = dev_alloc_skb(length); if (!skb) return -ENOMEM; atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.lacpdu_tx); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; skb->priority = TC_PRIO_CONTROL; lacpdu_header = skb_put(skb, length); ether_addr_copy(lacpdu_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback lacpdus in receive. */ ether_addr_copy(lacpdu_header->hdr.h_source, slave->perm_hwaddr); lacpdu_header->hdr.h_proto = PKT_TYPE_LACPDU; lacpdu_header->lacpdu = port->lacpdu; dev_queue_xmit(skb); return 0; } /** * ad_marker_send - send marker information/response on a given port * @port: the port we're looking at * @marker: marker data to send * * Returns: 0 on success * < 0 on error */ static int ad_marker_send(struct port *port, struct bond_marker *marker) { struct slave *slave = port->slave; struct sk_buff *skb; struct bond_marker_header *marker_header; int length = sizeof(struct bond_marker_header); skb = dev_alloc_skb(length + 16); if (!skb) return -ENOMEM; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_tx); break; case AD_MARKER_RESPONSE_SUBTYPE: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.marker_resp_tx); atomic64_inc(&BOND_AD_INFO(slave->bond).stats.marker_resp_tx); break; } skb_reserve(skb, 16); skb->dev = slave->dev; skb_reset_mac_header(skb); skb->network_header = skb->mac_header + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; marker_header = skb_put(skb, length); ether_addr_copy(marker_header->hdr.h_dest, lacpdu_mcast_addr); /* Note: source address is set to be the member's PERMANENT address, * because we use it to identify loopback MARKERs in receive. */ ether_addr_copy(marker_header->hdr.h_source, slave->perm_hwaddr); marker_header->hdr.h_proto = PKT_TYPE_LACPDU; marker_header->marker = *marker; dev_queue_xmit(skb); return 0; } /** * ad_mux_machine - handle a port's mux state machine * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_mux_machine(struct port *port, bool *update_slave_arr) { struct bonding *bond = __get_bond_by_port(port); mux_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_mux_state; if (port->sm_vars & AD_PORT_BEGIN) { port->sm_mux_state = AD_MUX_DETACHED; } else { switch (port->sm_mux_state) { case AD_MUX_DETACHED: if ((port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) /* if SELECTED or STANDBY */ port->sm_mux_state = AD_MUX_WAITING; break; case AD_MUX_WAITING: /* if SELECTED == FALSE return to DETACH state */ if (!(port->sm_vars & AD_PORT_SELECTED)) { port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the Selection Logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; break; } /* check if the wait_while_timer expired */ if (port->sm_mux_timer_counter && !(--port->sm_mux_timer_counter)) port->sm_vars |= AD_PORT_READY_N; /* in order to withhold the selection logic to check * all ports READY_N value every callback cycle to * update ready variable, we check READY_N and update * READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); /* if the wait_while_timer expired, and the port is * in READY state, move to ATTACHED state */ if ((port->sm_vars & AD_PORT_READY) && !port->sm_mux_timer_counter) port->sm_mux_state = AD_MUX_ATTACHED; break; case AD_MUX_ATTACHED: /* check also if agg_select_timer expired (so the * edable port will take place only after this timer) */ if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && !__check_agg_selection_timer(port)) { if (port->aggregator->is_active) { int state = AD_MUX_COLLECTING_DISTRIBUTING; if (!bond->params.coupled_control) state = AD_MUX_COLLECTING; port->sm_mux_state = state; } } else if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY)) { /* if UNSELECTED or STANDBY */ port->sm_vars &= ~AD_PORT_READY_N; /* in order to withhold the selection logic to * check all ports READY_N value every callback * cycle to update ready variable, we check * READY_N and update READY here */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); port->sm_mux_state = AD_MUX_DETACHED; } else if (port->aggregator->is_active) { port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; } break; case AD_MUX_COLLECTING_DISTRIBUTING: if (!__port_move_to_attached_state(port)) { /* if port state hasn't changed make * sure that a collecting distributing * port in an active aggregator is enabled */ if (port->aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; } } break; case AD_MUX_COLLECTING: if (!__port_move_to_attached_state(port)) { if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && (port->partner_oper.port_state & LACP_STATE_COLLECTING)) { port->sm_mux_state = AD_MUX_DISTRIBUTING; } else { /* If port state hasn't changed, make sure that a collecting * port is enabled for an active aggregator. */ struct slave *slave = port->slave; if (port->aggregator->is_active && bond_is_slave_rx_disabled(slave)) { ad_enable_collecting(port); *update_slave_arr = true; } } } break; case AD_MUX_DISTRIBUTING: if (!(port->sm_vars & AD_PORT_SELECTED) || (port->sm_vars & AD_PORT_STANDBY) || !(port->partner_oper.port_state & LACP_STATE_COLLECTING) || !(port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) || !(port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION)) { port->sm_mux_state = AD_MUX_COLLECTING; } else { /* if port state hasn't changed make * sure that a collecting distributing * port in an active aggregator is enabled */ if (port->aggregator && port->aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; } } break; default: break; } } /* check if the state machine was changed */ if (port->sm_mux_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Mux Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_mux_state); switch (port->sm_mux_state) { case AD_MUX_DETACHED: port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; ad_disable_collecting_distributing(port, update_slave_arr); port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->ntt = true; break; case AD_MUX_WAITING: port->sm_mux_timer_counter = __ad_timer_to_ticks(AD_WAIT_WHILE_TIMER, 0); break; case AD_MUX_ATTACHED: if (port->aggregator->is_active) port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; else port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; ad_disable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_COLLECTING_DISTRIBUTING: port->actor_oper_port_state |= LACP_STATE_COLLECTING; port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_COLLECTING: port->actor_oper_port_state |= LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting(port); ad_disable_distributing(port, update_slave_arr); port->ntt = true; break; case AD_MUX_DISTRIBUTING: port->actor_oper_port_state |= LACP_STATE_DISTRIBUTING; port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; ad_enable_collecting_distributing(port, update_slave_arr); break; default: break; } } } /** * ad_rx_machine - handle a port's rx State Machine * @lacpdu: the lacpdu we've received * @port: the port we're looking at * * If lacpdu arrived, stop previous timer (if exists) and set the next state as * CURRENT. If timer expired set the state machine in the proper state. * In other cases, this function checks if we need to switch to other state. */ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) { rx_states_t last_state; /* keep current State Machine state to compare later if it was * changed */ last_state = port->sm_rx_state; if (lacpdu) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.lacpdu_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.lacpdu_rx); } /* check if state machine should change state */ /* first, check if port was reinitialized */ if (port->sm_vars & AD_PORT_BEGIN) { port->sm_rx_state = AD_RX_INITIALIZE; port->sm_vars |= AD_PORT_CHURNED; /* check if port is not enabled */ } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled) port->sm_rx_state = AD_RX_PORT_DISABLED; /* check if new lacpdu arrived */ else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) || (port->sm_rx_state == AD_RX_DEFAULTED) || (port->sm_rx_state == AD_RX_CURRENT))) { if (port->sm_rx_state != AD_RX_CURRENT) port->sm_vars |= AD_PORT_CHURNED; port->sm_rx_timer_counter = 0; port->sm_rx_state = AD_RX_CURRENT; } else { /* if timer is on, and if it is expired */ if (port->sm_rx_timer_counter && !(--port->sm_rx_timer_counter)) { switch (port->sm_rx_state) { case AD_RX_EXPIRED: port->sm_rx_state = AD_RX_DEFAULTED; break; case AD_RX_CURRENT: port->sm_rx_state = AD_RX_EXPIRED; break; default: break; } } else { /* if no lacpdu arrived and no timer is on */ switch (port->sm_rx_state) { case AD_RX_PORT_DISABLED: if (port->is_enabled && (port->sm_vars & AD_PORT_LACP_ENABLED)) port->sm_rx_state = AD_RX_EXPIRED; else if (port->is_enabled && ((port->sm_vars & AD_PORT_LACP_ENABLED) == 0)) port->sm_rx_state = AD_RX_LACP_DISABLED; break; default: break; } } } /* check if the State machine was changed or new lacpdu arrived */ if ((port->sm_rx_state != last_state) || (lacpdu)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Rx Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_rx_state); switch (port->sm_rx_state) { case AD_RX_INITIALIZE: if (!(port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS)) port->sm_vars &= ~AD_PORT_LACP_ENABLED; else port->sm_vars |= AD_PORT_LACP_ENABLED; port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; port->sm_rx_state = AD_RX_PORT_DISABLED; fallthrough; case AD_RX_PORT_DISABLED: port->sm_vars &= ~AD_PORT_MATCHED; break; case AD_RX_LACP_DISABLED: port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->partner_oper.port_state &= ~LACP_STATE_AGGREGATION; port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_EXPIRED: /* Reset of the Synchronization flag (Standard 43.4.12) * This reset cause to disable this port in the * COLLECTING_DISTRIBUTING state of the mux machine in * case of EXPIRED even if LINK_DOWN didn't arrive for * the port. */ port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->sm_vars &= ~AD_PORT_MATCHED; port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT; port->partner_oper.port_state |= LACP_STATE_LACP_ACTIVITY; port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT)); port->actor_oper_port_state |= LACP_STATE_EXPIRED; port->sm_vars |= AD_PORT_CHURNED; break; case AD_RX_DEFAULTED: __update_default_selected(port); __record_default(port); port->sm_vars |= AD_PORT_MATCHED; port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; case AD_RX_CURRENT: /* detect loopback situation */ if (MAC_ADDRESS_EQUAL(&(lacpdu->actor_system), &(port->actor_system))) { slave_err(port->slave->bond->dev, port->slave->dev, "An illegal loopback occurred on slave\n" "Check the configuration to verify that all adapters are connected to 802.3ad compliant switch ports\n"); return; } __update_selected(lacpdu, port); __update_ntt(lacpdu, port); __record_pdu(lacpdu, port); port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(port->actor_oper_port_state & LACP_STATE_LACP_TIMEOUT)); port->actor_oper_port_state &= ~LACP_STATE_EXPIRED; break; default: break; } } } /** * ad_churn_machine - handle port churn's state machine * @port: the port we're looking at * */ static void ad_churn_machine(struct port *port) { if (port->sm_vars & AD_PORT_CHURNED) { port->sm_vars &= ~AD_PORT_CHURNED; port->sm_churn_actor_state = AD_CHURN_MONITOR; port->sm_churn_partner_state = AD_CHURN_MONITOR; port->sm_churn_actor_timer_counter = __ad_timer_to_ticks(AD_ACTOR_CHURN_TIMER, 0); port->sm_churn_partner_timer_counter = __ad_timer_to_ticks(AD_PARTNER_CHURN_TIMER, 0); return; } if (port->sm_churn_actor_timer_counter && !(--port->sm_churn_actor_timer_counter) && port->sm_churn_actor_state == AD_CHURN_MONITOR) { if (port->actor_oper_port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_actor_state = AD_NO_CHURN; } else { port->churn_actor_count++; port->sm_churn_actor_state = AD_CHURN; } } if (port->sm_churn_partner_timer_counter && !(--port->sm_churn_partner_timer_counter) && port->sm_churn_partner_state == AD_CHURN_MONITOR) { if (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) { port->sm_churn_partner_state = AD_NO_CHURN; } else { port->churn_partner_count++; port->sm_churn_partner_state = AD_CHURN; } } } /** * ad_tx_machine - handle a port's tx state machine * @port: the port we're looking at */ static void ad_tx_machine(struct port *port) { /* check if tx timer expired, to verify that we do not send more than * 3 packets per second */ if (port->sm_tx_timer_counter && !(--port->sm_tx_timer_counter)) { /* check if there is something to send */ if (port->ntt && (port->sm_vars & AD_PORT_LACP_ENABLED)) { __update_lacpdu_from_port(port); if (ad_lacpdu_send(port) >= 0) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent LACPDU on port %d\n", port->actor_port_number); /* mark ntt as false, so it will not be sent * again until demanded */ port->ntt = false; } } /* restart tx timer(to verify that we will not exceed * AD_MAX_TX_IN_SECOND */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; } } /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ static void ad_periodic_machine(struct port *port, struct bond_params *bond_params) { periodic_states_t last_state; /* keep current state machine state to compare later if it was changed */ last_state = port->sm_periodic_state; /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || !bond_params->lacp_active) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ else if (port->sm_periodic_timer_counter) { /* check if periodic state machine expired */ if (!(--port->sm_periodic_timer_counter)) { /* if expired then do tx */ port->sm_periodic_state = AD_PERIODIC_TX; } else { /* If not expired, check if there is some new timeout * parameter from the partner state */ switch (port->sm_periodic_state) { case AD_FAST_PERIODIC: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; break; case AD_SLOW_PERIODIC: if ((port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) { port->sm_periodic_timer_counter = 0; port->sm_periodic_state = AD_PERIODIC_TX; } break; default: break; } } } else { switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_state = AD_FAST_PERIODIC; break; case AD_PERIODIC_TX: if (!(port->partner_oper.port_state & LACP_STATE_LACP_TIMEOUT)) port->sm_periodic_state = AD_SLOW_PERIODIC; else port->sm_periodic_state = AD_FAST_PERIODIC; break; default: break; } } /* check if the state machine was changed */ if (port->sm_periodic_state != last_state) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Periodic Machine: Port=%d, Last State=%d, Curr State=%d\n", port->actor_port_number, last_state, port->sm_periodic_state); switch (port->sm_periodic_state) { case AD_NO_PERIODIC: port->sm_periodic_timer_counter = 0; break; case AD_FAST_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_FAST_PERIODIC_TIME))-1; break; case AD_SLOW_PERIODIC: /* decrement 1 tick we lost in the PERIODIC_TX cycle */ port->sm_periodic_timer_counter = __ad_timer_to_ticks(AD_PERIODIC_TIMER, (u16)(AD_SLOW_PERIODIC_TIME))-1; break; case AD_PERIODIC_TX: port->ntt = true; break; default: break; } } } /** * ad_port_selection_logic - select aggregation groups * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Select aggregation groups, and assign each port for it's aggregetor. The * selection logic is called in the inititalization (after all the handshkes), * and after every lacpdu receive (if selected is off). */ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) { struct aggregator *aggregator, *free_aggregator = NULL, *temp_aggregator; struct port *last_port = NULL, *curr_port; struct list_head *iter; struct bonding *bond; struct slave *slave; int found = 0; /* if the port is already Selected, do nothing */ if (port->sm_vars & AD_PORT_SELECTED) return; bond = __get_bond_by_port(port); /* if the port is connected to other aggregator, detach it */ if (port->aggregator) { /* detach the port from its former aggregator */ temp_aggregator = port->aggregator; for (curr_port = temp_aggregator->lag_ports; curr_port; last_port = curr_port, curr_port = curr_port->next_port_in_aggregator) { if (curr_port == port) { temp_aggregator->num_of_ports--; /* if it is the first port attached to the * aggregator */ if (!last_port) { temp_aggregator->lag_ports = port->next_port_in_aggregator; } else { /* not the first port attached to the * aggregator */ last_port->next_port_in_aggregator = port->next_port_in_aggregator; } /* clear the port's relations to this * aggregator */ port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->actor_port_aggregator_identifier = 0; slave_dbg(bond->dev, port->slave->dev, "Port %d left LAG %d\n", port->actor_port_number, temp_aggregator->aggregator_identifier); /* if the aggregator is empty, clear its * parameters, and set it ready to be attached */ if (!temp_aggregator->lag_ports) ad_clear_agg(temp_aggregator); break; } } if (!curr_port) { /* meaning: the port was related to an aggregator * but was not on the aggregator port list */ net_warn_ratelimited("%s: (slave %s): Warning: Port %d was related to aggregator %d but was not on its port list\n", port->slave->bond->dev->name, port->slave->dev->name, port->actor_port_number, port->aggregator->aggregator_identifier); } } /* search on all aggregators for a suitable aggregator for this port */ bond_for_each_slave(bond, slave, iter) { aggregator = &(SLAVE_AD_INFO(slave)->aggregator); /* keep a free aggregator for later use(if needed) */ if (!aggregator->lag_ports) { if (!free_aggregator) free_aggregator = aggregator; continue; } /* check if current aggregator suits us */ if (((aggregator->actor_oper_aggregator_key == port->actor_oper_port_key) && /* if all parameters match AND */ MAC_ADDRESS_EQUAL(&(aggregator->partner_system), &(port->partner_oper.system)) && (aggregator->partner_system_priority == port->partner_oper.system_priority) && (aggregator->partner_oper_aggregator_key == port->partner_oper.key) ) && ((__agg_has_partner(aggregator) && /* partner answers */ !aggregator->is_individual) /* but is not individual OR */ ) ) { /* attach to the founded aggregator */ port->aggregator = aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; port->next_port_in_aggregator = aggregator->lag_ports; port->aggregator->num_of_ports++; aggregator->lag_ports = port; slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; found = 1; break; } } /* the port couldn't find an aggregator - attach it to a new * aggregator */ if (!found) { if (free_aggregator) { /* assign port a new aggregator */ port->aggregator = free_aggregator; port->actor_port_aggregator_identifier = port->aggregator->aggregator_identifier; /* update the new aggregator's parameters * if port was responsed from the end-user */ if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS) /* if port is full duplex */ port->aggregator->is_individual = false; else port->aggregator->is_individual = true; port->aggregator->actor_admin_aggregator_key = port->actor_admin_port_key; port->aggregator->actor_oper_aggregator_key = port->actor_oper_port_key; port->aggregator->partner_system = port->partner_oper.system; port->aggregator->partner_system_priority = port->partner_oper.system_priority; port->aggregator->partner_oper_aggregator_key = port->partner_oper.key; port->aggregator->receive_state = 1; port->aggregator->transmit_state = 1; port->aggregator->lag_ports = port; port->aggregator->num_of_ports++; /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n", port->actor_port_number, port->aggregator->aggregator_identifier); } else { slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", port->actor_port_number); return; } } /* if all aggregator's ports are READY_N == TRUE, set ready=TRUE * in all aggregator's ports, else set ready=FALSE in all * aggregator's ports */ __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, update_slave_arr); if (!port->aggregator->is_active) port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; } /* Decide if "agg" is a better choice for the new active aggregator that * the current best, according to the ad_select policy. */ static struct aggregator *ad_agg_selection_test(struct aggregator *best, struct aggregator *curr) { /* 0. If no best, select current. * * 1. If the current agg is not individual, and the best is * individual, select current. * * 2. If current agg is individual and the best is not, keep best. * * 3. Therefore, current and best are both individual or both not * individual, so: * * 3a. If current agg partner replied, and best agg partner did not, * select current. * * 3b. If current agg partner did not reply and best agg partner * did reply, keep best. * * 4. Therefore, current and best both have partner replies or * both do not, so perform selection policy: * * BOND_AD_COUNT: Select by count of ports. If count is equal, * select by bandwidth. * * BOND_AD_STABLE, BOND_AD_BANDWIDTH: Select by bandwidth. */ if (!best) return curr; if (!curr->is_individual && best->is_individual) return curr; if (curr->is_individual && !best->is_individual) return best; if (__agg_has_partner(curr) && !__agg_has_partner(best)) return curr; if (!__agg_has_partner(curr) && __agg_has_partner(best)) return best; switch (__get_agg_selection_mode(curr->lag_ports)) { case BOND_AD_COUNT: if (__agg_active_ports(curr) > __agg_active_ports(best)) return curr; if (__agg_active_ports(curr) < __agg_active_ports(best)) return best; fallthrough; case BOND_AD_STABLE: case BOND_AD_BANDWIDTH: if (__get_agg_bandwidth(curr) > __get_agg_bandwidth(best)) return curr; break; default: net_warn_ratelimited("%s: (slave %s): Impossible agg select mode %d\n", curr->slave->bond->dev->name, curr->slave->dev->name, __get_agg_selection_mode(curr->lag_ports)); break; } return best; } static int agg_device_up(const struct aggregator *agg) { struct port *port = agg->lag_ports; if (!port) return 0; for (port = agg->lag_ports; port; port = port->next_port_in_aggregator) { if (netif_running(port->slave->dev) && netif_carrier_ok(port->slave->dev)) return 1; } return 0; } /** * ad_agg_selection_logic - select an aggregation group for a team * @agg: the aggregator we're looking at * @update_slave_arr: Does slave array need update? * * It is assumed that only one aggregator may be selected for a team. * * The logic of this function is to select the aggregator according to * the ad_select policy: * * BOND_AD_STABLE: select the aggregator with the most ports attached to * it, and to reselect the active aggregator only if the previous * aggregator has no more ports related to it. * * BOND_AD_BANDWIDTH: select the aggregator with the highest total * bandwidth, and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * BOND_AD_COUNT: select the aggregator with largest number of ports * (slaves), and reselect whenever a link state change takes place or the * set of slaves in the bond changes. * * FIXME: this function MUST be called with the first agg in the bond, or * __get_active_agg() won't work correctly. This function should be better * called with the bond itself, and retrieve the first agg from it. */ static void ad_agg_selection_logic(struct aggregator *agg, bool *update_slave_arr) { struct aggregator *best, *active, *origin; struct bonding *bond = agg->slave->bond; struct list_head *iter; struct slave *slave; struct port *port; rcu_read_lock(); origin = agg; active = __get_active_agg(agg); best = (active && agg_device_up(active)) ? active : NULL; bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); agg->is_active = 0; if (__agg_active_ports(agg) && agg_device_up(agg)) best = ad_agg_selection_test(best, agg); } if (best && __get_agg_selection_mode(best->lag_ports) == BOND_AD_STABLE) { /* For the STABLE policy, don't replace the old active * aggregator if it's still active (it has an answering * partner) or if both the best and active don't have an * answering partner. */ if (active && active->lag_ports && __agg_active_ports(active) && (__agg_has_partner(active) || (!__agg_has_partner(active) && !__agg_has_partner(best)))) { if (!(!active->actor_oper_aggregator_key && best->actor_oper_aggregator_key)) { best = NULL; active->is_active = 1; } } } if (best && (best == active)) { best = NULL; active->is_active = 1; } /* if there is new best aggregator, activate it */ if (best) { netdev_dbg(bond->dev, "(slave %s): best Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); netdev_dbg(bond->dev, "(slave %s): best ports %p slave %p\n", best->slave ? best->slave->dev->name : "NULL", best->lag_ports, best->slave); bond_for_each_slave_rcu(bond, slave, iter) { agg = &(SLAVE_AD_INFO(slave)->aggregator); slave_dbg(bond->dev, slave->dev, "Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", agg->aggregator_identifier, agg->num_of_ports, agg->actor_oper_aggregator_key, agg->partner_oper_aggregator_key, agg->is_individual, agg->is_active); } /* check if any partner replies */ if (best->is_individual) net_warn_ratelimited("%s: Warning: No 802.3ad response from the link partner for any adapters in the bond\n", bond->dev->name); best->is_active = 1; netdev_dbg(bond->dev, "(slave %s): LAG %d chosen as the active LAG\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier); netdev_dbg(bond->dev, "(slave %s): Agg=%d; P=%d; a k=%d; p k=%d; Ind=%d; Act=%d\n", best->slave ? best->slave->dev->name : "NULL", best->aggregator_identifier, best->num_of_ports, best->actor_oper_aggregator_key, best->partner_oper_aggregator_key, best->is_individual, best->is_active); /* disable the ports that were related to the former * active_aggregator */ if (active) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __disable_port(port); } } /* Slave array needs update. */ *update_slave_arr = true; } /* if the selected aggregator is of join individuals * (partner_system is NULL), enable their ports */ active = __get_active_agg(origin); if (active) { if (!__agg_has_partner(active)) { for (port = active->lag_ports; port; port = port->next_port_in_aggregator) { __enable_port(port); } *update_slave_arr = true; } } rcu_read_unlock(); bond_3ad_set_carrier(bond); } /** * ad_clear_agg - clear a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_clear_agg(struct aggregator *aggregator) { if (aggregator) { aggregator->is_individual = false; aggregator->actor_admin_aggregator_key = 0; aggregator->actor_oper_aggregator_key = 0; eth_zero_addr(aggregator->partner_system.mac_addr_value); aggregator->partner_system_priority = 0; aggregator->partner_oper_aggregator_key = 0; aggregator->receive_state = 0; aggregator->transmit_state = 0; aggregator->lag_ports = NULL; aggregator->is_active = 0; aggregator->num_of_ports = 0; pr_debug("%s: LAG %d was cleared\n", aggregator->slave ? aggregator->slave->dev->name : "NULL", aggregator->aggregator_identifier); } } /** * ad_initialize_agg - initialize a given aggregator's parameters * @aggregator: the aggregator we're looking at */ static void ad_initialize_agg(struct aggregator *aggregator) { if (aggregator) { ad_clear_agg(aggregator); eth_zero_addr(aggregator->aggregator_mac_address.mac_addr_value); aggregator->aggregator_identifier = 0; aggregator->slave = NULL; } } /** * ad_initialize_port - initialize a given port's parameters * @port: the port we're looking at * @lacp_fast: boolean. whether fast periodic should be used */ static void ad_initialize_port(struct port *port, int lacp_fast) { static const struct port_params tmpl = { .system_priority = 0xffff, .key = 1, .port_number = 1, .port_priority = 0xff, .port_state = 1, }; static const struct lacpdu lacpdu = { .subtype = 0x01, .version_number = 0x01, .tlv_type_actor_info = 0x01, .actor_information_length = 0x14, .tlv_type_partner_info = 0x02, .partner_information_length = 0x14, .tlv_type_collector_info = 0x03, .collector_information_length = 0x10, .collector_max_delay = htons(AD_COLLECTOR_MAX_DELAY), }; if (port) { port->actor_port_priority = 0xff; port->actor_port_aggregator_identifier = 0; port->ntt = false; port->actor_admin_port_state = LACP_STATE_AGGREGATION | LACP_STATE_LACP_ACTIVITY; port->actor_oper_port_state = LACP_STATE_AGGREGATION | LACP_STATE_LACP_ACTIVITY; if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; memcpy(&port->partner_admin, &tmpl, sizeof(tmpl)); memcpy(&port->partner_oper, &tmpl, sizeof(tmpl)); port->is_enabled = true; /* private parameters */ port->sm_vars = AD_PORT_BEGIN | AD_PORT_LACP_ENABLED; port->sm_rx_state = 0; port->sm_rx_timer_counter = 0; port->sm_periodic_state = 0; port->sm_periodic_timer_counter = 0; port->sm_mux_state = 0; port->sm_mux_timer_counter = 0; port->sm_tx_state = 0; port->aggregator = NULL; port->next_port_in_aggregator = NULL; port->transaction_id = 0; port->sm_churn_actor_timer_counter = 0; port->sm_churn_actor_state = 0; port->churn_actor_count = 0; port->sm_churn_partner_timer_counter = 0; port->sm_churn_partner_state = 0; port->churn_partner_count = 0; memcpy(&port->lacpdu, &lacpdu, sizeof(lacpdu)); } } /** * ad_enable_collecting - enable a port's receive * @port: the port we're looking at * * Enable @port if it's in an active aggregator */ static void ad_enable_collecting(struct port *port) { if (port->aggregator->is_active) { struct slave *slave = port->slave; slave_dbg(slave->bond->dev, slave->dev, "Enabling collecting on port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __enable_collecting_port(port); } } /** * ad_disable_distributing - disable a port's transmit * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator && __agg_has_partner(port->aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling distributing on port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __disable_distributing_port(port); /* Slave array needs an update */ *update_slave_arr = true; } } /** * ad_enable_collecting_distributing - enable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? * * Enable @port if it's in an active aggregator */ static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator->is_active) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Enabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __enable_port(port); /* Slave array needs update */ *update_slave_arr = true; } } /** * ad_disable_collecting_distributing - disable a port's transmit/receive * @port: the port we're looking at * @update_slave_arr: Does slave array need update? */ static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr) { if (port->aggregator && __agg_has_partner(port->aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling port %d (LAG %d)\n", port->actor_port_number, port->aggregator->aggregator_identifier); __disable_port(port); /* Slave array needs an update */ *update_slave_arr = true; } } /** * ad_marker_info_received - handle receive of a Marker information frame * @marker_info: Marker info received * @port: the port we're looking at */ static void ad_marker_info_received(struct bond_marker *marker_info, struct port *port) { struct bond_marker marker; atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_rx); /* copy the received marker data to the response marker */ memcpy(&marker, marker_info, sizeof(struct bond_marker)); /* change the marker subtype to marker response */ marker.tlv_type = AD_MARKER_RESPONSE_SUBTYPE; /* send the marker response */ if (ad_marker_send(port, &marker) >= 0) slave_dbg(port->slave->bond->dev, port->slave->dev, "Sent Marker Response on port %d\n", port->actor_port_number); } /** * ad_marker_response_received - handle receive of a marker response frame * @marker: marker PDU received * @port: the port we're looking at * * This function does nothing since we decided not to implement send and handle * response for marker PDU's, in this stage, but only to respond to marker * information. */ static void ad_marker_response_received(struct bond_marker *marker, struct port *port) { atomic64_inc(&SLAVE_AD_INFO(port->slave)->stats.marker_resp_rx); atomic64_inc(&BOND_AD_INFO(port->slave->bond).stats.marker_resp_rx); /* DO NOTHING, SINCE WE DECIDED NOT TO IMPLEMENT THIS FEATURE FOR NOW */ } /* ========= AD exported functions to the main bonding code ========= */ /* Check aggregators status in team every T seconds */ #define AD_AGGREGATOR_SELECTION_TIMER 8 /** * bond_3ad_initiate_agg_selection - initate aggregator selection * @bond: bonding struct * @timeout: timeout value to set * * Set the aggregation selection timer, to initiate an agg selection in * the very near future. Called during first initialization, and during * any down to up transitions of the bond. */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** * bond_3ad_initialize - initialize a bond's 802.3ad parameters and structures * @bond: bonding struct to work on * * Can be called only after the mac address of the bond is set. */ void bond_3ad_initialize(struct bonding *bond) { BOND_AD_INFO(bond).aggregator_identifier = 0; BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); bond_3ad_initiate_agg_selection(bond, AD_AGGREGATOR_SELECTION_TIMER * ad_ticks_per_sec); } /** * bond_3ad_bind_slave - initialize a slave's port * @slave: slave struct to work on * * Returns: 0 on success * < 0 on error */ void bond_3ad_bind_slave(struct slave *slave) { struct bonding *bond = bond_get_bond_by_slave(slave); struct port *port; struct aggregator *aggregator; /* check that the slave has not been initialized yet. */ if (SLAVE_AD_INFO(slave)->port.slave != slave) { /* port initialization */ port = &(SLAVE_AD_INFO(slave)->port); ad_initialize_port(port, bond->params.lacp_fast); port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and * user key */ port->actor_admin_port_key = bond->params.ad_user_port_key << 6; ad_update_actor_keys(port, false); /* actor system is the bond's system */ __ad_actor_update_port(port); /* tx timer(to verify that no more than MAX_TX_IN_SECOND * lacpdu's are sent in one second) */ port->sm_tx_timer_counter = ad_ticks_per_sec/AD_MAX_TX_IN_SECOND; __disable_port(port); /* aggregator initialization */ aggregator = &(SLAVE_AD_INFO(slave)->aggregator); ad_initialize_agg(aggregator); aggregator->aggregator_mac_address = *((struct mac_addr *)bond->dev->dev_addr); aggregator->aggregator_identifier = ++BOND_AD_INFO(bond).aggregator_identifier; aggregator->slave = slave; aggregator->is_active = 0; aggregator->num_of_ports = 0; } } /** * bond_3ad_unbind_slave - deinitialize a slave's port * @slave: slave struct to work on * * Search for the aggregator that is related to this port, remove the * aggregator and assign another aggregator for other port related to it * (if any), and remove the port. */ void bond_3ad_unbind_slave(struct slave *slave) { struct port *port, *prev_port, *temp_port; struct aggregator *aggregator, *new_aggregator, *temp_aggregator; int select_new_active_agg = 0; struct bonding *bond = slave->bond; struct slave *slave_iter; struct list_head *iter; bool dummy_slave_update; /* Ignore this value as caller updates array */ /* Sync against bond_3ad_state_machine_handler() */ spin_lock_bh(&bond->mode_lock); aggregator = &(SLAVE_AD_INFO(slave)->aggregator); port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(bond->dev, slave->dev, "Trying to unbind an uninitialized port\n"); goto out; } slave_dbg(bond->dev, slave->dev, "Unbinding Link Aggregation Group %d\n", aggregator->aggregator_identifier); /* Tell the partner that this port is not suitable for aggregation */ port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; port->actor_oper_port_state &= ~LACP_STATE_COLLECTING; port->actor_oper_port_state &= ~LACP_STATE_DISTRIBUTING; port->actor_oper_port_state &= ~LACP_STATE_AGGREGATION; __update_lacpdu_from_port(port); ad_lacpdu_send(port); /* check if this aggregator is occupied */ if (aggregator->lag_ports) { /* check if there are other ports related to this aggregator * except the port related to this slave(thats ensure us that * there is a reason to search for new aggregator, and that we * will find one */ if ((aggregator->lag_ports != port) || (aggregator->lag_ports->next_port_in_aggregator)) { /* find new aggregator for the related port(s) */ bond_for_each_slave(bond, slave_iter, iter) { new_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); /* if the new aggregator is empty, or it is * connected to our port only */ if (!new_aggregator->lag_ports || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator)) break; } if (!slave_iter) new_aggregator = NULL; /* if new aggregator found, copy the aggregator's * parameters and connect the related lag_ports to the * new aggregator */ if ((new_aggregator) && ((!new_aggregator->lag_ports) || ((new_aggregator->lag_ports == port) && !new_aggregator->lag_ports->next_port_in_aggregator))) { slave_dbg(bond->dev, slave->dev, "Some port(s) related to LAG %d - replacing with LAG %d\n", aggregator->aggregator_identifier, new_aggregator->aggregator_identifier); if ((new_aggregator->lag_ports == port) && new_aggregator->is_active) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); select_new_active_agg = 1; } new_aggregator->is_individual = aggregator->is_individual; new_aggregator->actor_admin_aggregator_key = aggregator->actor_admin_aggregator_key; new_aggregator->actor_oper_aggregator_key = aggregator->actor_oper_aggregator_key; new_aggregator->partner_system = aggregator->partner_system; new_aggregator->partner_system_priority = aggregator->partner_system_priority; new_aggregator->partner_oper_aggregator_key = aggregator->partner_oper_aggregator_key; new_aggregator->receive_state = aggregator->receive_state; new_aggregator->transmit_state = aggregator->transmit_state; new_aggregator->lag_ports = aggregator->lag_ports; new_aggregator->is_active = aggregator->is_active; new_aggregator->num_of_ports = aggregator->num_of_ports; /* update the information that is written on * the ports about the aggregator */ for (temp_port = aggregator->lag_ports; temp_port; temp_port = temp_port->next_port_in_aggregator) { temp_port->aggregator = new_aggregator; temp_port->actor_port_aggregator_identifier = new_aggregator->aggregator_identifier; } ad_clear_agg(aggregator); if (select_new_active_agg) ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } else { slave_warn(bond->dev, slave->dev, "unbinding aggregator, and could not find a new aggregator for its ports\n"); } } else { /* in case that the only port related to this * aggregator is the one we want to remove */ select_new_active_agg = aggregator->is_active; ad_clear_agg(aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ temp_aggregator = __get_first_agg(port); if (temp_aggregator) ad_agg_selection_logic(temp_aggregator, &dummy_slave_update); } } } slave_dbg(bond->dev, slave->dev, "Unbinding port %d\n", port->actor_port_number); /* find the aggregator that this port is connected to */ bond_for_each_slave(bond, slave_iter, iter) { temp_aggregator = &(SLAVE_AD_INFO(slave_iter)->aggregator); prev_port = NULL; /* search the port in the aggregator's related ports */ for (temp_port = temp_aggregator->lag_ports; temp_port; prev_port = temp_port, temp_port = temp_port->next_port_in_aggregator) { if (temp_port == port) { /* the aggregator found - detach the port from * this aggregator */ if (prev_port) prev_port->next_port_in_aggregator = temp_port->next_port_in_aggregator; else temp_aggregator->lag_ports = temp_port->next_port_in_aggregator; temp_aggregator->num_of_ports--; if (__agg_active_ports(temp_aggregator) == 0) { select_new_active_agg = temp_aggregator->is_active; if (temp_aggregator->num_of_ports == 0) ad_clear_agg(temp_aggregator); if (select_new_active_agg) { slave_info(bond->dev, slave->dev, "Removing an active aggregator\n"); /* select new active aggregator */ ad_agg_selection_logic(__get_first_agg(port), &dummy_slave_update); } } break; } } } port->slave = NULL; out: spin_unlock_bh(&bond->mode_lock); } /** * bond_3ad_update_ad_actor_settings - reflect change of actor settings to ports * @bond: bonding struct to work on * * If an ad_actor setting gets changed we need to update the individual port * settings so the bond device will use the new values when it gets upped. */ void bond_3ad_update_ad_actor_settings(struct bonding *bond) { struct list_head *iter; struct slave *slave; ASSERT_RTNL(); BOND_AD_INFO(bond).system.sys_priority = bond->params.ad_actor_sys_prio; if (is_zero_ether_addr(bond->params.ad_actor_system)) BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->dev->dev_addr); else BOND_AD_INFO(bond).system.sys_mac_addr = *((struct mac_addr *)bond->params.ad_actor_system); spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { struct port *port = &(SLAVE_AD_INFO(slave))->port; __ad_actor_update_port(port); port->ntt = true; } spin_unlock_bh(&bond->mode_lock); } /** * bond_agg_timer_advance - advance agg_select_timer * @bond: bonding structure * * Return true when agg_select_timer reaches 0. */ static bool bond_agg_timer_advance(struct bonding *bond) { int val, nval; while (1) { val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); if (!val) return false; nval = val - 1; if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, val, nval) == val) break; } return nval == 0; } /** * bond_3ad_state_machine_handler - handle state machines timeout * @work: work context to fetch bonding struct to work on from * * The state machine handling concept in this module is to check every tick * which state machine should operate any function. The execution order is * round robin, so when we have an interaction between state machines, the * reply of one to each other might be delayed until next tick. * * This function also complete the initialization when the agg_select_timer * times out, and it selects an aggregator for the ports that are yet not * related to any aggregator, and selects the active aggregator for a bond. */ void bond_3ad_state_machine_handler(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, ad_work.work); struct aggregator *aggregator; struct list_head *iter; struct slave *slave; struct port *port; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; bool update_slave_arr = false; /* Lock to protect data accessed by all (e.g., port->sm_vars) and * against running with bond_3ad_unbind_slave. ad_rx_machine may run * concurrently due to incoming LACPDU as well. */ spin_lock_bh(&bond->mode_lock); rcu_read_lock(); /* check if there are any slaves */ if (!bond_has_slaves(bond)) goto re_arm; if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? &(SLAVE_AD_INFO(slave)->port) : NULL; /* select the active aggregator for the bond */ if (port) { if (!port->slave) { net_warn_ratelimited("%s: Warning: bond's first port is uninitialized\n", bond->dev->name); goto re_arm; } aggregator = __get_first_agg(port); ad_agg_selection_logic(aggregator, &update_slave_arr); } bond_3ad_set_carrier(bond); } /* for each port run the state machines */ bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: Found an uninitialized port\n", bond->dev->name); goto re_arm; } ad_rx_machine(NULL, port); ad_periodic_machine(port, &bond->params); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); ad_churn_machine(port); /* turn off the BEGIN bit, since we already handled it */ if (port->sm_vars & AD_PORT_BEGIN) port->sm_vars &= ~AD_PORT_BEGIN; } re_arm: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } rcu_read_unlock(); spin_unlock_bh(&bond->mode_lock); if (update_slave_arr) bond_slave_arr_work_rearm(bond, 0); if (should_notify_rtnl && rtnl_trylock()) { bond_slave_state_notify(bond); rtnl_unlock(); } queue_delayed_work(bond->wq, &bond->ad_work, ad_delta_in_ticks); } /** * bond_3ad_rx_indication - handle a received frame * @lacpdu: received lacpdu * @slave: slave struct to work on * * It is assumed that frames that were sent on this NIC don't returned as new * received frames (loopback). Since only the payload is given to this * function, it check for loopback. */ static int bond_3ad_rx_indication(struct lacpdu *lacpdu, struct slave *slave) { struct bonding *bond = slave->bond; int ret = RX_HANDLER_ANOTHER; struct bond_marker *marker; struct port *port; atomic64_t *stat; port = &(SLAVE_AD_INFO(slave)->port); if (!port->slave) { net_warn_ratelimited("%s: Warning: port of slave %s is uninitialized\n", slave->dev->name, slave->bond->dev->name); return ret; } switch (lacpdu->subtype) { case AD_TYPE_LACPDU: ret = RX_HANDLER_CONSUMED; slave_dbg(slave->bond->dev, slave->dev, "Received LACPDU on port %d\n", port->actor_port_number); /* Protect against concurrent state machines */ spin_lock(&slave->bond->mode_lock); ad_rx_machine(lacpdu, port); spin_unlock(&slave->bond->mode_lock); break; case AD_TYPE_MARKER: ret = RX_HANDLER_CONSUMED; /* No need to convert fields to Little Endian since we * don't use the marker's fields. */ marker = (struct bond_marker *)lacpdu; switch (marker->tlv_type) { case AD_MARKER_INFORMATION_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Information on port %d\n", port->actor_port_number); ad_marker_info_received(marker, port); break; case AD_MARKER_RESPONSE_SUBTYPE: slave_dbg(slave->bond->dev, slave->dev, "Received Marker Response on port %d\n", port->actor_port_number); ad_marker_response_received(marker, port); break; default: slave_dbg(slave->bond->dev, slave->dev, "Received an unknown Marker subtype on port %d\n", port->actor_port_number); stat = &SLAVE_AD_INFO(slave)->stats.marker_unknown_rx; atomic64_inc(stat); stat = &BOND_AD_INFO(bond).stats.marker_unknown_rx; atomic64_inc(stat); } break; default: atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_unknown_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_unknown_rx); } return ret; } /** * ad_update_actor_keys - Update the oper / admin keys for a port based on * its current speed and duplex settings. * * @port: the port we'are looking at * @reset: Boolean to just reset the speed and the duplex part of the key * * The logic to change the oper / admin keys is: * (a) A full duplex port can participate in LACP with partner. * (b) When the speed is changed, LACP need to be reinitiated. */ static void ad_update_actor_keys(struct port *port, bool reset) { u8 duplex = 0; u16 ospeed = 0, speed = 0; u16 old_oper_key = port->actor_oper_port_key; port->actor_admin_port_key &= ~(AD_SPEED_KEY_MASKS|AD_DUPLEX_KEY_MASKS); if (!reset) { speed = __get_link_speed(port); ospeed = (old_oper_key & AD_SPEED_KEY_MASKS) >> 1; duplex = __get_duplex(port); port->actor_admin_port_key |= (speed << 1) | duplex; } port->actor_oper_port_key = port->actor_admin_port_key; if (old_oper_key != port->actor_oper_port_key) { /* Only 'duplex' port participates in LACP */ if (duplex) port->sm_vars |= AD_PORT_LACP_ENABLED; else port->sm_vars &= ~AD_PORT_LACP_ENABLED; if (!reset) { if (!speed) { slave_err(port->slave->bond->dev, port->slave->dev, "speed changed to 0 on port %d\n", port->actor_port_number); } else if (duplex && ospeed != speed) { /* Speed change restarts LACP state-machine */ port->sm_vars |= AD_PORT_BEGIN; } } } } /** * bond_3ad_adapter_speed_duplex_changed - handle a slave's speed / duplex * change indication * * @slave: slave struct to work on * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_adapter_speed_duplex_changed(struct slave *slave) { struct port *port; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "speed/duplex changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); ad_update_actor_keys(port, false); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed speed/duplex\n", port->actor_port_number); } /** * bond_3ad_handle_link_change - handle a slave's link status change indication * @slave: slave struct to work on * @link: whether the link is now up or down * * Handle reselection of aggregator (if needed) for this port. */ void bond_3ad_handle_link_change(struct slave *slave, char link) { struct aggregator *agg; struct port *port; bool dummy; port = &(SLAVE_AD_INFO(slave)->port); /* if slave is null, the whole port is not initialized */ if (!port->slave) { slave_warn(slave->bond->dev, slave->dev, "link status changed for uninitialized port\n"); return; } spin_lock_bh(&slave->bond->mode_lock); /* on link down we are zeroing duplex and speed since * some of the adaptors(ce1000.lan) report full duplex/speed * instead of N/A(duplex) / 0(speed). * * on link up we are forcing recheck on the duplex and speed since * some of he adaptors(ce1000.lan) report. */ if (link == BOND_LINK_UP) { port->is_enabled = true; ad_update_actor_keys(port, false); } else { /* link has failed */ port->is_enabled = false; ad_update_actor_keys(port, true); } agg = __get_first_agg(port); ad_agg_selection_logic(agg, &dummy); spin_unlock_bh(&slave->bond->mode_lock); slave_dbg(slave->bond->dev, slave->dev, "Port %d changed link status to %s\n", port->actor_port_number, link == BOND_LINK_UP ? "UP" : "DOWN"); /* RTNL is held and mode_lock is released so it's safe * to update slave_array here. */ bond_update_slave_arr(slave->bond, NULL); } /** * bond_3ad_set_carrier - set link state for bonding master * @bond: bonding structure * * if we have an active aggregator, we're up, if not, we're down. * Presumes that we cannot have an active aggregator if there are * no slaves with link up. * * This behavior complies with IEEE 802.3 section 43.3.9. * * Called by bond_set_carrier(). Return zero if carrier state does not * change, nonzero if it does. */ int bond_3ad_set_carrier(struct bonding *bond) { struct aggregator *active; struct slave *first_slave; int ret = 1; rcu_read_lock(); first_slave = bond_first_slave_rcu(bond); if (!first_slave) { ret = 0; goto out; } active = __get_active_agg(&(SLAVE_AD_INFO(first_slave)->aggregator)); if (active) { /* are enough slaves available to consider link up? */ if (__agg_active_ports(active) < bond->params.min_links) { if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); goto out; } } else if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); goto out; } } else if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); } out: rcu_read_unlock(); return ret; } /** * __bond_3ad_get_active_agg_info - get information of the active aggregator * @bond: bonding struct to work on * @ad_info: ad_info struct to fill with the bond's info * * Returns: 0 on success * < 0 on error */ int __bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { struct aggregator *aggregator = NULL; struct list_head *iter; struct slave *slave; struct port *port; bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (port->aggregator && port->aggregator->is_active) { aggregator = port->aggregator; break; } } if (!aggregator) return -1; ad_info->aggregator_id = aggregator->aggregator_identifier; ad_info->ports = __agg_active_ports(aggregator); ad_info->actor_key = aggregator->actor_oper_aggregator_key; ad_info->partner_key = aggregator->partner_oper_aggregator_key; ether_addr_copy(ad_info->partner_system, aggregator->partner_system.mac_addr_value); return 0; } int bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { int ret; rcu_read_lock(); ret = __bond_3ad_get_active_agg_info(bond, ad_info); rcu_read_unlock(); return ret; } int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct lacpdu *lacpdu, _lacpdu; if (skb->protocol != PKT_TYPE_LACPDU) return RX_HANDLER_ANOTHER; if (!MAC_ADDRESS_EQUAL(eth_hdr(skb)->h_dest, lacpdu_mcast_addr)) return RX_HANDLER_ANOTHER; lacpdu = skb_header_pointer(skb, 0, sizeof(_lacpdu), &_lacpdu); if (!lacpdu) { atomic64_inc(&SLAVE_AD_INFO(slave)->stats.lacpdu_illegal_rx); atomic64_inc(&BOND_AD_INFO(bond).stats.lacpdu_illegal_rx); return RX_HANDLER_ANOTHER; } return bond_3ad_rx_indication(lacpdu, slave); } /** * bond_3ad_update_lacp_rate - change the lacp rate * @bond: bonding struct * * When modify lacp_rate parameter via sysfs, * update actor_oper_port_state of each port. * * Hold bond->mode_lock, * so we can modify port->actor_oper_port_state, * no matter bond is up or down. */ void bond_3ad_update_lacp_rate(struct bonding *bond) { struct port *port = NULL; struct list_head *iter; struct slave *slave; int lacp_fast; lacp_fast = bond->params.lacp_fast; spin_lock_bh(&bond->mode_lock); bond_for_each_slave(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); if (lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; else port->actor_oper_port_state &= ~LACP_STATE_LACP_TIMEOUT; } spin_unlock_bh(&bond->mode_lock); } size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_UNKNOWN_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_ILLEGAL_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_TX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_RX */ nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_TX */ nla_total_size_64bit(sizeof(u64)); /* BOND_3AD_STAT_MARKER_UNKNOWN_RX */ } int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats) { u64 val; val = atomic64_read(&stats->lacpdu_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->lacpdu_illegal_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_ILLEGAL_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_resp_tx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_TX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; val = atomic64_read(&stats->marker_unknown_rx); if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_UNKNOWN_RX, val, BOND_3AD_STAT_PAD)) return -EMSGSIZE; return 0; }
3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/sfp.h> #include "netlink.h" #include "common.h" struct eeprom_req_info { struct ethnl_req_info base; u32 offset; u32 length; u8 page; u8 bank; u8 i2c_address; }; struct eeprom_reply_data { struct ethnl_reply_data base; u32 length; u8 *data; }; #define MODULE_EEPROM_REQINFO(__req_base) \ container_of(__req_base, struct eeprom_req_info, base) #define MODULE_EEPROM_REPDATA(__reply_base) \ container_of(__reply_base, struct eeprom_reply_data, base) static int fallback_set_params(struct eeprom_req_info *request, struct ethtool_modinfo *modinfo, struct ethtool_eeprom *eeprom) { u32 offset = request->offset; u32 length = request->length; if (request->page) offset = request->page * ETH_MODULE_EEPROM_PAGE_LEN + offset; if (modinfo->type == ETH_MODULE_SFF_8472 && request->i2c_address == 0x51) offset += ETH_MODULE_EEPROM_PAGE_LEN * 2; if (offset >= modinfo->eeprom_len) return -EINVAL; eeprom->cmd = ETHTOOL_GMODULEEEPROM; eeprom->len = length; eeprom->offset = offset; return 0; } static int eeprom_fallback(struct eeprom_req_info *request, struct eeprom_reply_data *reply) { struct net_device *dev = reply->base.dev; struct ethtool_modinfo modinfo = {0}; struct ethtool_eeprom eeprom = {0}; u8 *data; int err; modinfo.cmd = ETHTOOL_GMODULEINFO; err = ethtool_get_module_info_call(dev, &modinfo); if (err < 0) return err; err = fallback_set_params(request, &modinfo, &eeprom); if (err < 0) return err; data = kmalloc(eeprom.len, GFP_KERNEL); if (!data) return -ENOMEM; err = ethtool_get_module_eeprom_call(dev, &eeprom, data); if (err < 0) goto err_out; reply->data = data; reply->length = eeprom.len; return 0; err_out: kfree(data); return err; } static int get_module_eeprom_by_page(struct net_device *dev, struct ethtool_module_eeprom *page_data, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing is in progress"); return -EBUSY; } if (dev->sfp_bus) return sfp_get_module_eeprom_by_page(dev->sfp_bus, page_data, extack); if (ops->get_module_eeprom_by_page) return ops->get_module_eeprom_by_page(dev, page_data, extack); return -EOPNOTSUPP; } static int eeprom_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); struct ethtool_module_eeprom page_data = {0}; struct net_device *dev = reply_base->dev; int ret; page_data.offset = request->offset; page_data.length = request->length; page_data.i2c_address = request->i2c_address; page_data.page = request->page; page_data.bank = request->bank; page_data.data = kmalloc(page_data.length, GFP_KERNEL); if (!page_data.data) return -ENOMEM; ret = ethnl_ops_begin(dev); if (ret) goto err_free; ret = get_module_eeprom_by_page(dev, &page_data, info->extack); if (ret < 0) goto err_ops; reply->length = ret; reply->data = page_data.data; ethnl_ops_complete(dev); return 0; err_ops: ethnl_ops_complete(dev); err_free: kfree(page_data.data); if (ret == -EOPNOTSUPP) return eeprom_fallback(request, reply); return ret; } static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack) { struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info); if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] || !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] || !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] || !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]) return -EINVAL; request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]); request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]); request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]); /* The following set of conditions limit the API to only dump 1/2 * EEPROM page without crossing low page boundary located at offset 128. * This means user may only request dumps of length limited to 128 from * either low 128 bytes or high 128 bytes. * For pages higher than 0 only high 128 bytes are accessible. */ request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]); if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE], "reading from lower half page is allowed for page 0 only"); return -EINVAL; } if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN && request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross half page boundary is illegal"); return -EINVAL; } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], "reading cross page boundary is illegal"); return -EINVAL; } if (tb[ETHTOOL_A_MODULE_EEPROM_BANK]) request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]); return 0; } static int eeprom_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */ } static int eeprom_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data); } static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base) { struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); kfree(reply->data); } const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { .request_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, .reply_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, .hdr_attr = ETHTOOL_A_MODULE_EEPROM_HEADER, .req_info_size = sizeof(struct eeprom_req_info), .reply_data_size = sizeof(struct eeprom_reply_data), .parse_request = eeprom_parse_request, .prepare_data = eeprom_prepare_data, .reply_size = eeprom_reply_size, .fill_reply = eeprom_fill_reply, .cleanup_data = eeprom_cleanup_data, }; const struct nla_policy ethnl_module_eeprom_get_policy[] = { [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_EEPROM_OFFSET] = NLA_POLICY_MAX(NLA_U32, ETH_MODULE_EEPROM_PAGE_LEN * 2 - 1), [ETHTOOL_A_MODULE_EEPROM_LENGTH] = NLA_POLICY_RANGE(NLA_U32, 1, ETH_MODULE_EEPROM_PAGE_LEN), [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] = NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS), };
24 26 24 24 24 24 18 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 // SPDX-License-Identifier: GPL-2.0-only /* * vhost transport for vsock * * Copyright (C) 2013-2015 Red Hat, Inc. * Author: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> */ #include <linux/miscdevice.h> #include <linux/atomic.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <net/sock.h> #include <linux/virtio_vsock.h> #include <linux/vhost.h> #include <linux/hashtable.h> #include <net/af_vsock.h> #include "vhost.h" #define VHOST_VSOCK_DEFAULT_HOST_CID 2 /* Max number of bytes transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others. */ #define VHOST_VSOCK_WEIGHT 0x80000 /* Max number of packets transferred before requeueing the job. * Using this limit prevents one virtqueue from starving others with * small pkts. */ #define VHOST_VSOCK_PKT_WEIGHT 256 enum { VHOST_VSOCK_FEATURES = VHOST_FEATURES | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_VSOCK_F_SEQPACKET) }; enum { VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; /* Used to track all the vhost_vsock instances on the system. */ static DEFINE_MUTEX(vhost_vsock_mutex); static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8); struct vhost_vsock { struct vhost_dev dev; struct vhost_virtqueue vqs[2]; /* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */ struct hlist_node hash; struct vhost_work send_pkt_work; struct sk_buff_head send_pkt_queue; /* host->guest pending packets */ atomic_t queued_replies; u32 guest_cid; bool seqpacket_allow; }; static u32 vhost_transport_get_local_cid(void) { return VHOST_VSOCK_DEFAULT_HOST_CID; } /* Callers that dereference the return value must hold vhost_vsock_mutex or the * RCU read lock. */ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid) { struct vhost_vsock *vsock; hash_for_each_possible_rcu(vhost_vsock_hash, vsock, hash, guest_cid) { u32 other_cid = vsock->guest_cid; /* Skip instances that have no CID yet */ if (other_cid == 0) continue; if (other_cid == guest_cid) return vsock; } return NULL; } static void vhost_transport_do_send_pkt(struct vhost_vsock *vsock, struct vhost_virtqueue *vq) { struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; int pkts = 0, total_len = 0; bool added = false; bool restart_tx = false; mutex_lock(&vq->mutex); if (!vhost_vq_get_backend(vq)) goto out; if (!vq_meta_prefetch(vq)) goto out; /* Avoid further vmexits, we're already processing the virtqueue */ vhost_disable_notify(&vsock->dev, vq); do { struct virtio_vsock_hdr *hdr; size_t iov_len, payload_len; struct iov_iter iov_iter; u32 flags_to_restore = 0; struct sk_buff *skb; unsigned out, in; size_t nbytes; u32 offset; int head; skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); if (!skb) { vhost_enable_notify(&vsock->dev, vq); break; } head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); if (head < 0) { virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); break; } if (head == vq->num) { virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); /* We cannot finish yet if more buffers snuck in while * re-enabling notify. */ if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { vhost_disable_notify(&vsock->dev, vq); continue; } break; } if (out) { kfree_skb(skb); vq_err(vq, "Expected 0 output buffers, got %u\n", out); break; } iov_len = iov_length(&vq->iov[out], in); if (iov_len < sizeof(*hdr)) { kfree_skb(skb); vq_err(vq, "Buffer len [%zu] too small\n", iov_len); break; } iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len); offset = VIRTIO_VSOCK_SKB_CB(skb)->offset; payload_len = skb->len - offset; hdr = virtio_vsock_hdr(skb); /* If the packet is greater than the space available in the * buffer, we split it using multiple buffers. */ if (payload_len > iov_len - sizeof(*hdr)) { payload_len = iov_len - sizeof(*hdr); /* As we are copying pieces of large packet's buffer to * small rx buffers, headers of packets in rx queue are * created dynamically and are initialized with header * of current packet(except length). But in case of * SOCK_SEQPACKET, we also must clear message delimeter * bit (VIRTIO_VSOCK_SEQ_EOM) and MSG_EOR bit * (VIRTIO_VSOCK_SEQ_EOR) if set. Otherwise, * there will be sequence of packets with these * bits set. After initialized header will be copied to * rx buffer, these required bits will be restored. */ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM; if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) { hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR; } } } /* Set the correct length in the header */ hdr->len = cpu_to_le32(payload_len); nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter); if (nbytes != sizeof(*hdr)) { kfree_skb(skb); vq_err(vq, "Faulted on copying pkt hdr\n"); break; } if (skb_copy_datagram_iter(skb, offset, &iov_iter, payload_len)) { kfree_skb(skb); vq_err(vq, "Faulted on copying pkt buf\n"); break; } /* Deliver to monitoring devices all packets that we * will transmit. */ virtio_transport_deliver_tap_pkt(skb); vhost_add_used(vq, head, sizeof(*hdr) + payload_len); added = true; VIRTIO_VSOCK_SKB_CB(skb)->offset += payload_len; total_len += payload_len; /* If we didn't send all the payload we can requeue the packet * to send it with the next available buffer. */ if (VIRTIO_VSOCK_SKB_CB(skb)->offset < skb->len) { hdr->flags |= cpu_to_le32(flags_to_restore); /* We are queueing the same skb to handle * the remaining bytes, and we want to deliver it * to monitoring devices in the next iteration. */ virtio_vsock_skb_clear_tap_delivered(skb); virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); } else { if (virtio_vsock_skb_reply(skb)) { int val; val = atomic_dec_return(&vsock->queued_replies); /* Do we have resources to resume tx * processing? */ if (val + 1 == tx_vq->num) restart_tx = true; } virtio_transport_consume_skb_sent(skb, true); } } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); if (added) vhost_signal(&vsock->dev, vq); out: mutex_unlock(&vq->mutex); if (restart_tx) vhost_poll_queue(&tx_vq->poll); } static void vhost_transport_send_pkt_work(struct vhost_work *work) { struct vhost_virtqueue *vq; struct vhost_vsock *vsock; vsock = container_of(work, struct vhost_vsock, send_pkt_work); vq = &vsock->vqs[VSOCK_VQ_RX]; vhost_transport_do_send_pkt(vsock, vq); } static int vhost_transport_send_pkt(struct sk_buff *skb) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct vhost_vsock *vsock; int len = skb->len; rcu_read_lock(); /* Find the vhost_vsock according to guest context id */ vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid)); if (!vsock) { rcu_read_unlock(); kfree_skb(skb); return -ENODEV; } if (virtio_vsock_skb_reply(skb)) atomic_inc(&vsock->queued_replies); virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work); rcu_read_unlock(); return len; } static int vhost_transport_cancel_pkt(struct vsock_sock *vsk) { struct vhost_vsock *vsock; int cnt = 0; int ret = -ENODEV; rcu_read_lock(); /* Find the vhost_vsock according to guest context id */ vsock = vhost_vsock_get(vsk->remote_addr.svm_cid); if (!vsock) goto out; cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue); if (cnt) { struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; int new_cnt; new_cnt = atomic_sub_return(cnt, &vsock->queued_replies); if (new_cnt + cnt >= tx_vq->num && new_cnt < tx_vq->num) vhost_poll_queue(&tx_vq->poll); } ret = 0; out: rcu_read_unlock(); return ret; } static struct sk_buff * vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, unsigned int out, unsigned int in) { struct virtio_vsock_hdr *hdr; struct iov_iter iov_iter; struct sk_buff *skb; size_t payload_len; size_t nbytes; size_t len; if (in != 0) { vq_err(vq, "Expected 0 input buffers, got %u\n", in); return NULL; } len = iov_length(vq->iov, out); /* len contains both payload and hdr */ skb = virtio_vsock_alloc_skb(len, GFP_KERNEL); if (!skb) return NULL; iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len); hdr = virtio_vsock_hdr(skb); nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter); if (nbytes != sizeof(*hdr)) { vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n", sizeof(*hdr), nbytes); kfree_skb(skb); return NULL; } payload_len = le32_to_cpu(hdr->len); /* No payload */ if (!payload_len) return skb; /* The pkt is too big or the length in the header is invalid */ if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || payload_len + sizeof(*hdr) > len) { kfree_skb(skb); return NULL; } virtio_vsock_skb_rx_put(skb); nbytes = copy_from_iter(skb->data, payload_len, &iov_iter); if (nbytes != payload_len) { vq_err(vq, "Expected %zu byte payload, got %zu bytes\n", payload_len, nbytes); kfree_skb(skb); return NULL; } return skb; } /* Is there space left for replies to rx packets? */ static bool vhost_vsock_more_replies(struct vhost_vsock *vsock) { struct vhost_virtqueue *vq = &vsock->vqs[VSOCK_VQ_TX]; int val; smp_rmb(); /* paired with atomic_inc() and atomic_dec_return() */ val = atomic_read(&vsock->queued_replies); return val < vq->num; } static bool vhost_transport_msgzerocopy_allow(void) { return true; } static bool vhost_transport_seqpacket_allow(u32 remote_cid); static struct virtio_transport vhost_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = vhost_transport_get_local_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, .cancel_pkt = vhost_transport_cancel_pkt, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_dequeue = virtio_transport_dgram_dequeue, .dgram_bind = virtio_transport_dgram_bind, .dgram_allow = virtio_transport_dgram_allow, .stream_enqueue = virtio_transport_stream_enqueue, .stream_dequeue = virtio_transport_stream_dequeue, .stream_has_data = virtio_transport_stream_has_data, .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, .seqpacket_allow = vhost_transport_seqpacket_allow, .seqpacket_has_data = virtio_transport_seqpacket_has_data, .msgzerocopy_allow = vhost_transport_msgzerocopy_allow, .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, .notify_send_init = virtio_transport_notify_send_init, .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .unsent_bytes = virtio_transport_unsent_bytes, .read_skb = virtio_transport_read_skb, }, .send_pkt = vhost_transport_send_pkt, }; static bool vhost_transport_seqpacket_allow(u32 remote_cid) { struct vhost_vsock *vsock; bool seqpacket_allow = false; rcu_read_lock(); vsock = vhost_vsock_get(remote_cid); if (vsock) seqpacket_allow = vsock->seqpacket_allow; rcu_read_unlock(); return seqpacket_allow; } static void vhost_vsock_handle_tx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, dev); int head, pkts = 0, total_len = 0; unsigned int out, in; struct sk_buff *skb; bool added = false; mutex_lock(&vq->mutex); if (!vhost_vq_get_backend(vq)) goto out; if (!vq_meta_prefetch(vq)) goto out; vhost_disable_notify(&vsock->dev, vq); do { struct virtio_vsock_hdr *hdr; if (!vhost_vsock_more_replies(vsock)) { /* Stop tx until the device processes already * pending replies. Leave tx virtqueue * callbacks disabled. */ goto no_more_replies; } head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), &out, &in, NULL, NULL); if (head < 0) break; if (head == vq->num) { if (unlikely(vhost_enable_notify(&vsock->dev, vq))) { vhost_disable_notify(&vsock->dev, vq); continue; } break; } skb = vhost_vsock_alloc_skb(vq, out, in); if (!skb) { vq_err(vq, "Faulted on pkt\n"); continue; } total_len += sizeof(*hdr) + skb->len; /* Deliver to monitoring devices all received packets */ virtio_transport_deliver_tap_pkt(skb); hdr = virtio_vsock_hdr(skb); /* Only accept correctly addressed packets */ if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid && le64_to_cpu(hdr->dst_cid) == vhost_transport_get_local_cid()) virtio_transport_recv_pkt(&vhost_transport, skb); else kfree_skb(skb); vhost_add_used(vq, head, 0); added = true; } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); no_more_replies: if (added) vhost_signal(&vsock->dev, vq); out: mutex_unlock(&vq->mutex); } static void vhost_vsock_handle_rx_kick(struct vhost_work *work) { struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue, poll.work); struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, dev); vhost_transport_do_send_pkt(vsock, vq); } static int vhost_vsock_start(struct vhost_vsock *vsock) { struct vhost_virtqueue *vq; size_t i; int ret; mutex_lock(&vsock->dev.mutex); ret = vhost_dev_check_owner(&vsock->dev); if (ret) goto err; for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = &vsock->vqs[i]; mutex_lock(&vq->mutex); if (!vhost_vq_access_ok(vq)) { ret = -EFAULT; goto err_vq; } if (!vhost_vq_get_backend(vq)) { vhost_vq_set_backend(vq, vsock); ret = vhost_vq_init_access(vq); if (ret) goto err_vq; } mutex_unlock(&vq->mutex); } /* Some packets may have been queued before the device was started, * let's kick the send worker to send them. */ vhost_vq_work_queue(&vsock->vqs[VSOCK_VQ_RX], &vsock->send_pkt_work); mutex_unlock(&vsock->dev.mutex); return 0; err_vq: vhost_vq_set_backend(vq, NULL); mutex_unlock(&vq->mutex); for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = &vsock->vqs[i]; mutex_lock(&vq->mutex); vhost_vq_set_backend(vq, NULL); mutex_unlock(&vq->mutex); } err: mutex_unlock(&vsock->dev.mutex); return ret; } static int vhost_vsock_stop(struct vhost_vsock *vsock, bool check_owner) { size_t i; int ret = 0; mutex_lock(&vsock->dev.mutex); if (check_owner) { ret = vhost_dev_check_owner(&vsock->dev); if (ret) goto err; } for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { struct vhost_virtqueue *vq = &vsock->vqs[i]; mutex_lock(&vq->mutex); vhost_vq_set_backend(vq, NULL); mutex_unlock(&vq->mutex); } err: mutex_unlock(&vsock->dev.mutex); return ret; } static void vhost_vsock_free(struct vhost_vsock *vsock) { kvfree(vsock); } static int vhost_vsock_dev_open(struct inode *inode, struct file *file) { struct vhost_virtqueue **vqs; struct vhost_vsock *vsock; int ret; /* This struct is large and allocation could fail, fall back to vmalloc * if there is no other way. */ vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!vsock) return -ENOMEM; vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL); if (!vqs) { ret = -ENOMEM; goto out; } vsock->guest_cid = 0; /* no CID assigned yet */ vsock->seqpacket_allow = false; atomic_set(&vsock->queued_replies, 0); vqs[VSOCK_VQ_TX] = &vsock->vqs[VSOCK_VQ_TX]; vqs[VSOCK_VQ_RX] = &vsock->vqs[VSOCK_VQ_RX]; vsock->vqs[VSOCK_VQ_TX].handle_kick = vhost_vsock_handle_tx_kick; vsock->vqs[VSOCK_VQ_RX].handle_kick = vhost_vsock_handle_rx_kick; vhost_dev_init(&vsock->dev, vqs, ARRAY_SIZE(vsock->vqs), UIO_MAXIOV, VHOST_VSOCK_PKT_WEIGHT, VHOST_VSOCK_WEIGHT, true, NULL); file->private_data = vsock; skb_queue_head_init(&vsock->send_pkt_queue); vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work); return 0; out: vhost_vsock_free(vsock); return ret; } static void vhost_vsock_flush(struct vhost_vsock *vsock) { vhost_dev_flush(&vsock->dev); } static void vhost_vsock_reset_orphans(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); /* vmci_transport.c doesn't take sk_lock here either. At least we're * under vsock_table_lock so the sock cannot disappear while we're * executing. */ /* If the peer is still valid, no need to reset connection */ if (vhost_vsock_get(vsk->remote_addr.svm_cid)) return; /* If the close timeout is pending, let it expire. This avoids races * with the timeout callback. */ if (vsk->close_work_scheduled) return; sock_set_flag(sk, SOCK_DONE); vsk->peer_shutdown = SHUTDOWN_MASK; sk->sk_state = SS_UNCONNECTED; sk->sk_err = ECONNRESET; sk_error_report(sk); } static int vhost_vsock_dev_release(struct inode *inode, struct file *file) { struct vhost_vsock *vsock = file->private_data; mutex_lock(&vhost_vsock_mutex); if (vsock->guest_cid) hash_del_rcu(&vsock->hash); mutex_unlock(&vhost_vsock_mutex); /* Wait for other CPUs to finish using vsock */ synchronize_rcu(); /* Iterating over all connections for all CIDs to find orphans is * inefficient. Room for improvement here. */ vsock_for_each_connected_socket(&vhost_transport.transport, vhost_vsock_reset_orphans); /* Don't check the owner, because we are in the release path, so we * need to stop the vsock device in any case. * vhost_vsock_stop() can not fail in this case, so we don't need to * check the return code. */ vhost_vsock_stop(vsock, false); vhost_vsock_flush(vsock); vhost_dev_stop(&vsock->dev); virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue); vhost_dev_cleanup(&vsock->dev); kfree(vsock->dev.vqs); vhost_vsock_free(vsock); return 0; } static int vhost_vsock_set_cid(struct vhost_vsock *vsock, u64 guest_cid) { struct vhost_vsock *other; /* Refuse reserved CIDs */ if (guest_cid <= VMADDR_CID_HOST || guest_cid == U32_MAX) return -EINVAL; /* 64-bit CIDs are not yet supported */ if (guest_cid > U32_MAX) return -EINVAL; /* Refuse if CID is assigned to the guest->host transport (i.e. nested * VM), to make the loopback work. */ if (vsock_find_cid(guest_cid)) return -EADDRINUSE; /* Refuse if CID is already in use */ mutex_lock(&vhost_vsock_mutex); other = vhost_vsock_get(guest_cid); if (other && other != vsock) { mutex_unlock(&vhost_vsock_mutex); return -EADDRINUSE; } if (vsock->guest_cid) hash_del_rcu(&vsock->hash); vsock->guest_cid = guest_cid; hash_add_rcu(vhost_vsock_hash, &vsock->hash, vsock->guest_cid); mutex_unlock(&vhost_vsock_mutex); return 0; } static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) { struct vhost_virtqueue *vq; int i; if (features & ~VHOST_VSOCK_FEATURES) return -EOPNOTSUPP; mutex_lock(&vsock->dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(&vsock->dev)) { goto err; } if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { if (vhost_init_device_iotlb(&vsock->dev)) goto err; } vsock->seqpacket_allow = features & (1ULL << VIRTIO_VSOCK_F_SEQPACKET); for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { vq = &vsock->vqs[i]; mutex_lock(&vq->mutex); vq->acked_features = features; mutex_unlock(&vq->mutex); } mutex_unlock(&vsock->dev.mutex); return 0; err: mutex_unlock(&vsock->dev.mutex); return -EFAULT; } static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, unsigned long arg) { struct vhost_vsock *vsock = f->private_data; void __user *argp = (void __user *)arg; u64 guest_cid; u64 features; int start; int r; switch (ioctl) { case VHOST_VSOCK_SET_GUEST_CID: if (copy_from_user(&guest_cid, argp, sizeof(guest_cid))) return -EFAULT; return vhost_vsock_set_cid(vsock, guest_cid); case VHOST_VSOCK_SET_RUNNING: if (copy_from_user(&start, argp, sizeof(start))) return -EFAULT; if (start) return vhost_vsock_start(vsock); else return vhost_vsock_stop(vsock, true); case VHOST_GET_FEATURES: features = VHOST_VSOCK_FEATURES; if (copy_to_user(argp, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_FEATURES: if (copy_from_user(&features, argp, sizeof(features))) return -EFAULT; return vhost_vsock_set_features(vsock, features); case VHOST_GET_BACKEND_FEATURES: features = VHOST_VSOCK_BACKEND_FEATURES; if (copy_to_user(argp, &features, sizeof(features))) return -EFAULT; return 0; case VHOST_SET_BACKEND_FEATURES: if (copy_from_user(&features, argp, sizeof(features))) return -EFAULT; if (features & ~VHOST_VSOCK_BACKEND_FEATURES) return -EOPNOTSUPP; vhost_set_backend_features(&vsock->dev, features); return 0; default: mutex_lock(&vsock->dev.mutex); r = vhost_dev_ioctl(&vsock->dev, ioctl, argp); if (r == -ENOIOCTLCMD) r = vhost_vring_ioctl(&vsock->dev, ioctl, argp); else vhost_vsock_flush(vsock); mutex_unlock(&vsock->dev.mutex); return r; } } static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct vhost_vsock *vsock = file->private_data; struct vhost_dev *dev = &vsock->dev; int noblock = file->f_flags & O_NONBLOCK; return vhost_chr_read_iter(dev, to, noblock); } static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct vhost_vsock *vsock = file->private_data; struct vhost_dev *dev = &vsock->dev; return vhost_chr_write_iter(dev, from); } static __poll_t vhost_vsock_chr_poll(struct file *file, poll_table *wait) { struct vhost_vsock *vsock = file->private_data; struct vhost_dev *dev = &vsock->dev; return vhost_chr_poll(file, dev, wait); } static const struct file_operations vhost_vsock_fops = { .owner = THIS_MODULE, .open = vhost_vsock_dev_open, .release = vhost_vsock_dev_release, .llseek = noop_llseek, .unlocked_ioctl = vhost_vsock_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, .read_iter = vhost_vsock_chr_read_iter, .write_iter = vhost_vsock_chr_write_iter, .poll = vhost_vsock_chr_poll, }; static struct miscdevice vhost_vsock_misc = { .minor = VHOST_VSOCK_MINOR, .name = "vhost-vsock", .fops = &vhost_vsock_fops, }; static int __init vhost_vsock_init(void) { int ret; ret = vsock_core_register(&vhost_transport.transport, VSOCK_TRANSPORT_F_H2G); if (ret < 0) return ret; ret = misc_register(&vhost_vsock_misc); if (ret) { vsock_core_unregister(&vhost_transport.transport); return ret; } return 0; }; static void __exit vhost_vsock_exit(void) { misc_deregister(&vhost_vsock_misc); vsock_core_unregister(&vhost_transport.transport); }; module_init(vhost_vsock_init); module_exit(vhost_vsock_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Asias He"); MODULE_DESCRIPTION("vhost transport for vsock "); MODULE_ALIAS_MISCDEV(VHOST_VSOCK_MINOR); MODULE_ALIAS("devname:vhost-vsock");
9 136 1 58 178 41 10 1 17 31 370 4 666 667 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 /* SPDX-License-Identifier: GPL-2.0 */ /* * Connection state tracking for netfilter. This is separated from, * but required by, the (future) NAT layer; it can also be used by an iptables * extension. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack.h */ #ifndef _NF_CONNTRACK_H #define _NF_CONNTRACK_H #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/nf_conntrack_dccp.h> #include <linux/netfilter/nf_conntrack_sctp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <net/netfilter/nf_conntrack_tuple.h> struct nf_ct_udp { unsigned long stream_ts; }; /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ struct nf_ct_dccp dccp; struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct nf_ct_udp udp; struct nf_ct_gre gre; unsigned int tmpl_padto; }; union nf_conntrack_expect_proto { /* insert expect proto private data here */ }; struct nf_conntrack_net_ecache { struct delayed_work dwork; spinlock_t dying_lock; struct hlist_nulls_head dying_list; }; struct nf_conntrack_net { /* only used when new connection is allocated: */ atomic_t count; unsigned int expect_count; /* only used from work queues, configuration plane, and so on: */ unsigned int users4; unsigned int users6; unsigned int users_bridge; #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS struct nf_conntrack_net_ecache ecache; #endif }; #include <linux/types.h> #include <linux/skbuff.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> struct nf_conn { /* Usage count in here is 1 for hash table, 1 per skb, * plus 1 for any connection(s) we are `master' for * * Hint, SKB address this struct and refcnt via skb->_nfct and * helpers nf_conntrack_get() and nf_conntrack_put(). * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, * except that the latter uses internal indirection and does not * result in a conntrack module dependency. * beware nf_ct_get() is different and don't inc refcnt. */ struct nf_conntrack ct_general; spinlock_t lock; /* jiffies32 when this ct is considered dead */ u32 timeout; #ifdef CONFIG_NF_CONNTRACK_ZONES struct nf_conntrack_zone zone; #endif /* XXX should I move this to the tail ? - Y.K */ /* These are my tuples; original and reply */ struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; /* Have we seen traffic both ways yet? (bitset) */ unsigned long status; possible_net_t ct_net; #if IS_ENABLED(CONFIG_NF_NAT) struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ struct { } __nfct_init_offset; /* If we were expected by an expectation, this will be it */ struct nf_conn *master; #if defined(CONFIG_NF_CONNTRACK_MARK) u_int32_t mark; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK u_int32_t secmark; #endif /* Extensions */ struct nf_ct_ext *ext; /* Storage reserved for other modules, must be the last member */ union nf_conntrack_proto proto; }; static inline struct nf_conn * nf_ct_to_nf_conn(const struct nf_conntrack *nfct) { return container_of(nfct, struct nf_conn, ct_general); } static inline struct nf_conn * nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) { return container_of(hash, struct nf_conn, tuplehash[hash->tuple.dst.dir]); } static inline u_int16_t nf_ct_l3num(const struct nf_conn *ct) { return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; } static inline u_int8_t nf_ct_protonum(const struct nf_conn *ct) { return ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum; } #define nf_ct_tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) /* get master conntrack via master expectation */ #define master_ct(conntr) (conntr->master) extern struct net init_net; static inline struct net *nf_ct_net(const struct nf_conn *ct) { return read_pnet(&ct->ct_net); } /* Is this tuple taken? (ignoring any belonging to the given conntrack). */ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack); /* Return conntrack_info and tuple hash for given skb. */ static inline struct nf_conn * nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) { unsigned long nfct = skb_get_nfct(skb); *ctinfo = nfct & NFCT_INFOMASK; return (struct nf_conn *)(nfct & NFCT_PTRMASK); } void nf_ct_destroy(struct nf_conntrack *nfct); void nf_conntrack_tcp_set_closing(struct nf_conn *ct); /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { if (ct && refcount_dec_and_test(&ct->ct_general.use)) nf_ct_destroy(&ct->ct_general); } /* load module; enable/disable conntrack in this namespace */ int nf_ct_netns_get(struct net *net, u8 nfproto); void nf_ct_netns_put(struct net *net, u8 nfproto); /* * Allocate a hashtable of hlist_head (if nulls == 0), * or hlist_nulls_head (if nulls == 1) */ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); int nf_conntrack_hash_check_insert(struct nf_conn *ct); bool nf_ct_delete(struct nf_conn *ct, u32 pid, int report); bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, struct net *net, struct nf_conntrack_tuple *tuple); void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, u32 extra_jiffies, unsigned int bytes); /* Refresh conntrack for this many jiffies and do accounting */ static inline void nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb, u32 extra_jiffies) { __nf_ct_refresh_acct(ct, ctinfo, extra_jiffies, skb->len); } /* Refresh conntrack for this many jiffies */ static inline void nf_ct_refresh(struct nf_conn *ct, u32 extra_jiffies) { __nf_ct_refresh_acct(ct, 0, extra_jiffies, 0); } /* kill conntrack and do accounting */ bool nf_ct_kill_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct sk_buff *skb); /* kill conntrack without accounting */ static inline bool nf_ct_kill(struct nf_conn *ct) { return nf_ct_delete(ct, 0, 0); } struct nf_ct_iter_data { struct net *net; void *data; u32 portid; int report; }; /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), const struct nf_ct_iter_data *iter_data); /* also set unconfirmed conntracks as dying. Only use in module exit path. */ void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data); struct nf_conntrack_zone; void nf_conntrack_free(struct nf_conn *ct); struct nf_conn *nf_conntrack_alloc(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp); static inline int nf_ct_is_template(const struct nf_conn *ct) { return test_bit(IPS_TEMPLATE_BIT, &ct->status); } /* It's confirmed if it is, or has been in the hash table. */ static inline int nf_ct_is_confirmed(const struct nf_conn *ct) { return test_bit(IPS_CONFIRMED_BIT, &ct->status); } static inline int nf_ct_is_dying(const struct nf_conn *ct) { return test_bit(IPS_DYING_BIT, &ct->status); } /* Packet is received from loopback */ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) { return skb->dev && skb->skb_iif && skb->dev->flags & IFF_LOOPBACK; } static inline void nf_conntrack_alter_reply(struct nf_conn *ct, const struct nf_conntrack_tuple *newreply) { /* Must be unconfirmed, so not in hash table yet */ if (WARN_ON(nf_ct_is_confirmed(ct))) return; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; } #define nfct_time_stamp ((u32)(jiffies)) /* jiffies until ct expires, 0 if already expired */ static inline unsigned long nf_ct_expires(const struct nf_conn *ct) { s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; return max(timeout, 0); } static inline bool nf_ct_is_expired(const struct nf_conn *ct) { return (__s32)(READ_ONCE(ct->timeout) - nfct_time_stamp) <= 0; } /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) struct kernel_param; int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp); int nf_conntrack_hash_resize(unsigned int hashsize); extern struct hlist_nulls_head *nf_conntrack_hash; extern unsigned int nf_conntrack_htable_size; extern seqcount_spinlock_t nf_conntrack_generation; extern unsigned int nf_conntrack_max; /* must be called with rcu read lock held */ static inline void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize) { struct hlist_nulls_head *hptr; unsigned int sequence, hsz; do { sequence = read_seqcount_begin(&nf_conntrack_generation); hsz = nf_conntrack_htable_size; hptr = nf_conntrack_hash; } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); *hash = hptr; *hsize = hsz; } struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags); void nf_ct_tmpl_free(struct nf_conn *tmpl); u32 nf_ct_get_id(const struct nf_conn *ct); u32 nf_conntrack_count(const struct net *net); static inline void nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) { skb_set_nfct(skb, (unsigned long)ct | info); } extern unsigned int nf_conntrack_net_id; static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net) { return net_generic(net, nf_conntrack_net_id); } int nf_ct_skb_network_trim(struct sk_buff *skb, int family); int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, u16 zone, u8 family, u8 *proto, u16 *mru); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_ADD_ATOMIC(net, count, v) this_cpu_add((net)->ct.stat->count, (v)) #define MODULE_ALIAS_NFCT_HELPER(helper) \ MODULE_ALIAS("nfct-helper-" helper) #endif /* _NF_CONNTRACK_H */
10 10 10 10 6 6 6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us> */ #ifndef __NET_TC_VLAN_H #define __NET_TC_VLAN_H #include <net/act_api.h> #include <linux/tc_act/tc_vlan.h> struct tcf_vlan_params { int tcfv_action; unsigned char tcfv_push_dst[ETH_ALEN]; unsigned char tcfv_push_src[ETH_ALEN]; u16 tcfv_push_vid; __be16 tcfv_push_proto; u8 tcfv_push_prio; bool tcfv_push_prio_exists; struct rcu_head rcu; }; struct tcf_vlan { struct tc_action common; struct tcf_vlan_params __rcu *vlan_p; }; #define to_vlan(a) ((struct tcf_vlan *)a) static inline bool is_tcf_vlan(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT if (a->ops && a->ops->id == TCA_ID_VLAN) return true; #endif return false; } static inline u32 tcf_vlan_action(const struct tc_action *a) { u32 tcfv_action; rcu_read_lock(); tcfv_action = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_action; rcu_read_unlock(); return tcfv_action; } static inline u16 tcf_vlan_push_vid(const struct tc_action *a) { u16 tcfv_push_vid; rcu_read_lock(); tcfv_push_vid = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_vid; rcu_read_unlock(); return tcfv_push_vid; } static inline __be16 tcf_vlan_push_proto(const struct tc_action *a) { __be16 tcfv_push_proto; rcu_read_lock(); tcfv_push_proto = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_proto; rcu_read_unlock(); return tcfv_push_proto; } static inline u8 tcf_vlan_push_prio(const struct tc_action *a) { u8 tcfv_push_prio; rcu_read_lock(); tcfv_push_prio = rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_prio; rcu_read_unlock(); return tcfv_push_prio; } static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest, const struct tc_action *a) { rcu_read_lock(); memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN); memcpy(src, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); rcu_read_unlock(); } #endif /* __NET_TC_VLAN_H */
35 1 1376 2455 1280 1160 104 41 6 2455 2448 1354 1 1361 1473 221 1733 103 64 40 104 104 151 151 150 149 151 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETFILTER_H #define __LINUX_NETFILTER_H #include <linux/init.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/if.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/static_key.h> #include <linux/module.h> #include <linux/netfilter_defs.h> #include <linux/netdevice.h> #include <linux/sockptr.h> #include <net/net_namespace.h> static inline int NF_DROP_GETERR(int verdict) { return -(verdict >> NF_VERDICT_QBITS); } static __always_inline int NF_DROP_REASON(struct sk_buff *skb, enum skb_drop_reason reason, u32 err) { BUILD_BUG_ON(err > 0xffff); kfree_skb_reason(skb, reason); return ((err << 16) | NF_STOLEN); } static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return a1->all[0] == a2->all[0] && a1->all[1] == a2->all[1] && a1->all[2] == a2->all[2] && a1->all[3] == a2->all[3]; #endif } static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, union nf_inet_addr *result, const union nf_inet_addr *mask) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ua = (const unsigned long *)a1; unsigned long *ur = (unsigned long *)result; const unsigned long *um = (const unsigned long *)mask; ur[0] = ua[0] & um[0]; ur[1] = ua[1] & um[1]; #else result->all[0] = a1->all[0] & mask->all[0]; result->all[1] = a1->all[1] & mask->all[1]; result->all[2] = a1->all[2] & mask->all[2]; result->all[3] = a1->all[3] & mask->all[3]; #endif } int netfilter_init(void); struct sk_buff; struct nf_hook_ops; struct sock; struct nf_hook_state { u8 hook; u8 pf; struct net_device *in; struct net_device *out; struct sock *sk; struct net *net; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, NF_HOOK_OP_BPF, }; struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; void *priv; u8 pf; enum nf_hook_ops_type hook_ops_type:8; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; }; struct nf_hook_entry { nf_hookfn *hook; void *priv; }; struct nf_hook_entries_rcu_head { struct rcu_head head; void *allocation; }; struct nf_hook_entries { u16 num_hook_entries; /* padding */ struct nf_hook_entry hooks[]; /* trailer: pointers to original orig_ops of each hook, * followed by rcu_head and scratch space used for freeing * the structure via call_rcu. * * This is not part of struct nf_hook_entry since its only * needed in slow path (hook register/unregister): * const struct nf_hook_ops *orig_ops[] * * For the same reason, we store this at end -- its * only needed when a hook is deleted, not during * packet path processing: * struct nf_hook_entries_rcu_head head */ }; #ifdef CONFIG_NETFILTER static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e) { unsigned int n = e->num_hook_entries; const void *hook_end; hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! */ return (struct nf_hook_ops **)hook_end; } static inline int nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb, struct nf_hook_state *state) { return entry->hook(entry->priv, skb, state); } static inline void nf_hook_state_init(struct nf_hook_state *p, unsigned int hook, u_int8_t pf, struct net_device *indev, struct net_device *outdev, struct sock *sk, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { p->hook = hook; p->pf = pf; p->in = indev; p->out = outdev; p->sk = sk; p->net = net; p->okfn = okfn; } struct nf_sockopt_ops { struct list_head list; u_int8_t pf; /* Non-inclusive ranges: use 0/0/NULL to never get called. */ int set_optmin; int set_optmax; int (*set)(struct sock *sk, int optval, sockptr_t arg, unsigned int len); int get_optmin; int get_optmax; int (*get)(struct sock *sk, int optval, void __user *user, int *len); /* Use the module struct to lock set/get code in place */ struct module *owner; }; /* Function to register/unregister hook points. */ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef CONFIG_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int i); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e); /** * nf_hook - call a netfilter hook * * Returns 1 if the hook has allowed the packet to pass. The function * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. */ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; int ret = 1; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return 1; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; case NFPROTO_ARP: #ifdef CONFIG_NETFILTER_FAMILY_ARP if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp))) break; hook_head = rcu_dereference(net->nf.hooks_arp[hook]); #endif break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, hook_head, 0); } rcu_read_unlock(); return ret; } /* Activate hook; either okfn or kfree_skb called, unless a hook returns NF_STOLEN (in which case, it's up to the hook to deal with the consequences). Returns -ERRNO if packet dropped. Zero means queued, stolen or accepted. */ /* RR: > I don't want nf_hook to return anything because people might forget > about async and trust the return value to mean "packet was ok". AK: Just document it clearly, then you can expect some sense from kernel coders :) */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { int ret; if (!cond || ((ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn)) == 1)) ret = okfn(net, sk, skb); return ret; } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); if (ret == 1) ret = okfn(net, sk, skb); return ret; } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); nf_hook_slow_list(head, &state, hook_head); } rcu_read_unlock(); } /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt, unsigned int len); int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); struct flowi; struct nf_queue_entry; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol, unsigned short family); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u_int8_t protocol, unsigned short family); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family); #include <net/flow.h> struct nf_conn; enum nf_nat_manip_type; struct nlattr; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); void (*remove_nat_bysrc)(struct nf_conn *ct); }; extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook && nat_hook->decode_session) nat_hook->decode_session(skb, fl); rcu_read_unlock(); #endif } #else /* !CONFIG_NETFILTER */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { return okfn(net, sk, skb); } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return okfn(net, sk, skb); } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { /* nothing to do */ } static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return 1; } struct flowi; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { } #endif /*CONFIG_NETFILTER*/ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_zones_common.h> void nf_ct_attach(struct sk_buff *, const struct sk_buff *); void nf_ct_set_closing(struct nf_conntrack *nfct); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {} struct nf_conntrack_tuple; static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { return false; } #endif struct nf_conn; enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); int (*confirm)(struct sk_buff *skb); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; struct nfnl_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u_int16_t ct_attr, u_int16_t ct_info_attr); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report); void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; struct nf_defrag_hook { struct module *owner; int (*enable)(struct net *net); void (*disable)(struct net *net); }; extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; /* * nf_skb_duplicated - TEE target has sent a packet * * When a xtables target sends a packet, the OUTPUT and POSTROUTING * hooks are traversed again, i.e. nft and xtables are invoked recursively. * * This is used by xtables TEE target to prevent the duplicated skb from * being duplicated again. */ DECLARE_PER_CPU(bool, nf_skb_duplicated); /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. */ extern u8 nf_ctnetlink_has_listener; #endif /*__LINUX_NETFILTER_H*/
551 553 546 79 520 525 578 572 582 582 544 56 25 529 544 476 22 271 56 20 136 149 473 475 71 71 37 40 3 37 37 13 32 37 38 35 386 571 18 576 579 65 512 18 501 513 53 53 379 2 2 79 79 2 79 79 79 79 79 6 74 51 51 417 369 410 410 51 317 109 410 410 185 410 29 385 2 3 410 370 370 365 418 79 79 504 111 504 510 1 11 73 461 516 517 201 200 17 257 98 510 1 20 3 3 536 538 539 24 24 1 35 510 510 510 482 500 476 274 207 15 15 213 475 400 11 11 207 206 39 37 207 39 200 11 50 176 11 11 2 6 11 238 207 182 367 367 73 309 366 226 367 21 139 127 225 123 17 115 119 113 11 53 53 53 12 53 53 53 53 244 183 62 10 53 645 645 646 578 17 2 564 4 543 18 170 569 14 568 4 559 562 582 580 18 13 18 18 18 45 2 18 583 483 12 606 1 613 613 69 542 611 613 614 70 538 608 70 543 607 1 614 479 39 178 70 69 545 542 39 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/memfd.h> #include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/secretmem.h> #include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/pagevec.h> #include <linux/sched/mm.h> #include <linux/shmem_fs.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include "internal.h" struct follow_page_context { struct dev_pagemap *pgmap; unsigned int page_mask; }; static inline void sanity_check_pinned_pages(struct page **pages, unsigned long npages) { if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; /* * We only pin anonymous pages if they are exclusive. Once pinned, we * can no longer turn them possibly shared and PageAnonExclusive() will * stick around until the page is freed. * * We'd like to verify that our pinned anonymous pages are still mapped * exclusively. The issue with anon THP is that we don't know how * they are/were mapped when pinning them. However, for anon * THP we can assume that either the given page (PTE-mapped THP) or * the head page (PMD-mapped THP) should be PageAnonExclusive(). If * neither is the case, there is certainly something wrong. */ for (; npages; npages--, pages++) { struct page *page = *pages; struct folio *folio; if (!page) continue; folio = page_folio(page); if (is_zero_page(page) || !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio)) VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); else /* Either a PTE-mapped or a PMD-mapped THP. */ VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) && !PageAnonExclusive(page), page); } } /* * Return the folio with ref appropriately incremented, * or NULL if that failed. */ static inline struct folio *try_get_folio(struct page *page, int refs) { struct folio *folio; retry: folio = page_folio(page); if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) return NULL; if (unlikely(!folio_ref_try_add(folio, refs))) return NULL; /* * At this point we have a stable reference to the folio; but it * could be that between calling page_folio() and the refcount * increment, the folio was split, in which case we'd end up * holding a reference on a folio that has nothing to do with the page * we were given anymore. * So now that the folio is stable, recheck that the page still * belongs to this folio. */ if (unlikely(page_folio(page) != folio)) { folio_put_refs(folio, refs); goto retry; } return folio; } static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { if (flags & FOLL_PIN) { if (is_zero_folio(folio)) return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_has_pincount(folio)) atomic_sub(refs, &folio->_pincount); else refs *= GUP_PIN_COUNTING_BIAS; } folio_put_refs(folio, refs); } /** * try_grab_folio() - add a folio's refcount by a flag-dependent amount * @folio: pointer to folio to be grabbed * @refs: the value to (effectively) add to the folio's refcount * @flags: gup flags: these are the FOLL_* flag values * * This might not do anything at all, depending on the flags argument. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same * time. * * Return: 0 for success, or if no action was required (if neither FOLL_PIN * nor FOLL_GET was set, nothing is done). A negative error code for failure: * * -ENOMEM FOLL_GET or FOLL_PIN was set, but the folio could not * be grabbed. * * It is called when we have a stable reference for the folio, typically in * GUP slow path. */ int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags) { if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_add(folio, refs); else if (flags & FOLL_PIN) { /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_folio(folio)) return 0; /* * Increment the normal page refcount field at least once, * so that the page really is pinned. */ if (folio_has_pincount(folio)) { folio_ref_add(folio, refs); atomic_add(refs, &folio->_pincount); } else { folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS); } node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); } return 0; } /** * unpin_user_page() - release a dma-pinned page * @page: pointer to page to be released * * Pages that were pinned via pin_user_pages*() must be released via either * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so * that such pages can be separately tracked and uniquely handled. In * particular, interactions with RDMA and filesystems need special handling. */ void unpin_user_page(struct page *page) { sanity_check_pinned_pages(&page, 1); gup_put_folio(page_folio(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page); /** * unpin_folio() - release a dma-pinned folio * @folio: pointer to folio to be released * * Folios that were pinned via memfd_pin_folios() or other similar routines * must be released either using unpin_folio() or unpin_folios(). */ void unpin_folio(struct folio *folio) { gup_put_folio(folio, 1, FOLL_PIN); } EXPORT_SYMBOL_GPL(unpin_folio); /** * folio_add_pin - Try to get an additional pin on a pinned folio * @folio: The folio to be pinned * * Get an additional pin on a folio we already have a pin on. Makes no change * if the folio is a zero_page. */ void folio_add_pin(struct folio *folio) { if (is_zero_folio(folio)) return; /* * Similar to try_grab_folio(): be sure to *also* increment the normal * page refcount field at least once, so that the page really is * pinned. */ if (folio_has_pincount(folio)) { WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1); folio_ref_inc(folio); atomic_inc(&folio->_pincount); } else { WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS); folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); } } static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { struct page *next = nth_page(start, i); struct folio *folio = page_folio(next); unsigned int nr = 1; if (folio_test_large(folio)) nr = min_t(unsigned int, npages - i, folio_nr_pages(folio) - folio_page_idx(folio, next)); *ntails = nr; return folio; } static inline struct folio *gup_folio_next(struct page **list, unsigned long npages, unsigned long i, unsigned int *ntails) { struct folio *folio = page_folio(list[i]); unsigned int nr; for (nr = i + 1; nr < npages; nr++) { if (page_folio(list[nr]) != folio) break; } *ntails = nr - i; return folio; } /** * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page" refers to a page that has had one of the get_user_pages() * variants called on that page. * * For each page in the @pages array, make that page (or its head page, if a * compound page) dirty, if @make_dirty is true, and if the page was previously * listed as clean. In any case, releases all pages using unpin_user_page(), * possibly via unpin_user_pages(), for the non-dirty case. * * Please see the unpin_user_page() documentation for details. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; if (!make_dirty) { unpin_user_pages(pages, npages); return; } sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); /* * Checking PageDirty at this point may race with * clear_page_dirty_for_io(), but that's OK. Two key * cases: * * 1) This code sees the page as already dirty, so it * skips the call to set_page_dirty(). That could happen * because clear_page_dirty_for_io() called * folio_mkclean(), followed by set_page_dirty(). * However, now the page is going to get written back, * which meets the original intention of setting it * dirty, so all is well: clear_page_dirty_for_io() goes * on to call TestClearPageDirty(), and write the page * back. * * 2) This code sees the page as clean, so it calls * set_page_dirty(). The page stays dirty, despite being * written back, so it gets written back again in the * next writeback cycle. This is harmless. */ if (!folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages_dirty_lock); /** * unpin_user_page_range_dirty_lock() - release and optionally dirty * gup-pinned page range * * @page: the starting page of a range maybe marked dirty, and definitely released. * @npages: number of consecutive pages to release. * @make_dirty: whether to mark the pages dirty * * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * * For the page ranges defined by [page .. page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: * set_page_dirty_lock(), unpin_user_page(). * */ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty) { unsigned long i; struct folio *folio; unsigned int nr; for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { folio_lock(folio); folio_mark_dirty(folio); folio_unlock(folio); } gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * Don't perform any sanity checks because we might have raced with * fork() and some anonymous pages might now actually be shared -- * which is why we're unpinning after all. */ for (i = 0; i < npages; i += nr) { folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } } /** * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * * For each page in the @pages array, release the page using unpin_user_page(). * * Please see the unpin_user_page() documentation for details. */ void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long i; struct folio *folio; unsigned int nr; /* * If this WARN_ON() fires, then the system *might* be leaking pages (by * leaving them pinned), but probably not. More likely, gup/pup returned * a hard -ERRNO error to the caller, who erroneously passed it here. */ if (WARN_ON(IS_ERR_VALUE(npages))) return; sanity_check_pinned_pages(pages, npages); for (i = 0; i < npages; i += nr) { if (!pages[i]) { nr = 1; continue; } folio = gup_folio_next(pages, npages, i, &nr); gup_put_folio(folio, nr, FOLL_PIN); } } EXPORT_SYMBOL(unpin_user_pages); /** * unpin_user_folio() - release pages of a folio * @folio: pointer to folio to be released * @npages: number of pages of same folio * * Release npages of the folio */ void unpin_user_folio(struct folio *folio, unsigned long npages) { gup_put_folio(folio, npages, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_folio); /** * unpin_folios() - release an array of gup-pinned folios. * @folios: array of folios to be marked dirty and released. * @nfolios: number of folios in the @folios array. * * For each folio in the @folios array, release the folio using gup_put_folio. * * Please see the unpin_folio() documentation for details. */ void unpin_folios(struct folio **folios, unsigned long nfolios) { unsigned long i = 0, j; /* * If this WARN_ON() fires, then the system *might* be leaking folios * (by leaving them pinned), but probably not. More likely, gup/pup * returned a hard -ERRNO error to the caller, who erroneously passed * it here. */ if (WARN_ON(IS_ERR_VALUE(nfolios))) return; while (i < nfolios) { for (j = i + 1; j < nfolios; j++) if (folios[i] != folios[j]) break; if (folios[i]) gup_put_folio(folios[i], j - i, FOLL_PIN); i = j; } } EXPORT_SYMBOL_GPL(unpin_folios); /* * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) { if (!test_bit(MMF_HAS_PINNED, mm_flags)) set_bit(MMF_HAS_PINNED, mm_flags); } #ifdef CONFIG_MMU #ifdef CONFIG_HAVE_GUP_FAST static int record_subpages(struct page *page, unsigned long sz, unsigned long addr, unsigned long end, struct page **pages) { struct page *start_page; int nr; start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT); for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) pages[nr] = nth_page(start_page, nr); return nr; } /** * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. * @page: pointer to page to be grabbed * @refs: the value to (effectively) add to the folio's refcount * @flags: gup flags: these are the FOLL_* flag values. * * "grab" names in this file mean, "look at flags to decide whether to use * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. * * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the * same time. (That's true throughout the get_user_pages*() and * pin_user_pages*() APIs.) Cases: * * FOLL_GET: folio's refcount will be incremented by @refs. * * FOLL_PIN on large folios: folio's refcount will be incremented by * @refs, and its pincount will be incremented by @refs. * * FOLL_PIN on single-page folios: folio's refcount will be incremented by * @refs * GUP_PIN_COUNTING_BIAS. * * Return: The folio containing @page (with refcount appropriately * incremented) for success, or NULL upon failure. If neither FOLL_GET * nor FOLL_PIN was set, that's considered failure, and furthermore, * a likely bug in the caller, so a warning is also emitted. * * It uses add ref unless zero to elevate the folio refcount and must be called * in fast path only. */ static struct folio *try_grab_folio_fast(struct page *page, int refs, unsigned int flags) { struct folio *folio; /* Raise warn if it is not called in fast GUP */ VM_WARN_ON_ONCE(!irqs_disabled()); if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0)) return NULL; if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL; if (flags & FOLL_GET) return try_get_folio(page, refs); /* FOLL_PIN is set */ /* * Don't take a pin on the zero page - it's not going anywhere * and it is used in a *lot* of places. */ if (is_zero_page(page)) return page_folio(page); folio = try_get_folio(page, refs); if (!folio) return NULL; /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow * path. */ if (unlikely((flags & FOLL_LONGTERM) && !folio_is_longterm_pinnable(folio))) { folio_put_refs(folio, refs); return NULL; } /* * When pinning a large folio, use an exact count to track it. * * However, be sure to *also* increment the normal folio * refcount field at least once, so that the folio really * is pinned. That's why the refcount from the earlier * try_get_folio() is left intact. */ if (folio_has_pincount(folio)) atomic_add(refs, &folio->_pincount); else folio_ref_add(folio, refs * (GUP_PIN_COUNTING_BIAS - 1)); /* * Adjust the pincount before re-checking the PTE for changes. * This is essentially a smp_mb() and is paired with a memory * barrier in folio_try_share_anon_rmap_*(). */ smp_mb__after_atomic(); node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); return folio; } #endif /* CONFIG_HAVE_GUP_FAST */ /* Common code for can_follow_write_* */ static inline bool can_follow_write_common(struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* Maybe FOLL_FORCE is set to override it? */ if (!(flags & FOLL_FORCE)) return false; /* But FOLL_FORCE has no effect on shared mappings */ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return false; /* ... or read-only private ones */ if (!(vma->vm_flags & VM_MAYWRITE)) return false; /* ... or already writable ones that just need to take a write fault */ if (vma->vm_flags & VM_WRITE) return false; /* * See can_change_pte_writable(): we broke COW and could map the page * writable if we have an exclusive anonymous page ... */ return page && PageAnon(page) && PageAnonExclusive(page); } static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { if (!(flags & FOLL_DUMP)) return NULL; /* * When core dumping, we don't want to allocate unnecessary pages or * page tables. Return error instead of NULL to skip handle_mm_fault, * then get_dump_page() will return NULL to leave a hole in the dump. * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ if (is_vm_hugetlb_page(vma)) { struct hstate *h = hstate_vma(vma); if (!hugetlbfs_pagecache_present(h, vma, address)) return ERR_PTR(-EFAULT); } else if ((vma_is_anonymous(vma) || !vma->vm_ops->fault)) { return ERR_PTR(-EFAULT); } return NULL; } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES /* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */ static inline bool can_follow_write_pud(pud_t pud, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pud is writable, we can write to the page. */ if (pud_write(pud)) return true; return can_follow_write_common(page, vma, flags); } static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) { struct mm_struct *mm = vma->vm_mm; struct page *page; pud_t pud = *pudp; unsigned long pfn = pud_pfn(pud); int ret; assert_spin_locked(pud_lockptr(mm, pudp)); if (!pud_present(pud)) return NULL; if ((flags & FOLL_WRITE) && !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && pud_devmap(pud)) { /* * device mapped pages can only be returned if the caller * will manage the page reference count. * * At least one of FOLL_GET | FOLL_PIN must be set, so * assert that here: */ if (!(flags & (FOLL_GET | FOLL_PIN))) return ERR_PTR(-EEXIST); if (flags & FOLL_TOUCH) touch_pud(vma, addr, pudp, flags & FOLL_WRITE); ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap); if (!ctx->pgmap) return ERR_PTR(-EFAULT); } page = pfn_to_page(pfn); if (!pud_devmap(pud) && !pud_write(pud) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) page = ERR_PTR(ret); else ctx->page_mask = HPAGE_PUD_NR - 1; return page; } /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pmd is writable, we can write to the page. */ if (pmd_write(pmd)) return true; if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ if (pmd_needs_soft_dirty_wp(vma, pmd)) return false; return !userfaultfd_huge_pmd_wp(vma, pmd); } static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, struct follow_page_context *ctx) { struct mm_struct *mm = vma->vm_mm; pmd_t pmdval = *pmd; struct page *page; int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); page = pmd_page(pmdval); if ((flags & FOLL_WRITE) && !can_follow_write_pmd(pmdval, page, vma, flags)) return NULL; /* Avoid dumping huge zero page */ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval)) return ERR_PTR(-EFAULT); if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags)) return NULL; if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); ret = try_grab_folio(page_folio(page), 1, flags); if (ret) return ERR_PTR(ret); #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH)) touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; ctx->page_mask = HPAGE_PMD_NR - 1; return page; } #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) { return NULL; } static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, struct follow_page_context *ctx) { return NULL; } #endif /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, pte_t *pte, unsigned int flags) { if (flags & FOLL_TOUCH) { pte_t orig_entry = ptep_get(pte); pte_t entry = orig_entry; if (flags & FOLL_WRITE) entry = pte_mkdirty(entry); entry = pte_mkyoung(entry); if (!pte_same(orig_entry, entry)) { set_pte_at(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } } /* Proper page table entry exists, but no corresponding struct page */ return -EEXIST; } /* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */ static inline bool can_follow_write_pte(pte_t pte, struct page *page, struct vm_area_struct *vma, unsigned int flags) { /* If the pte is writable, we can write to the page. */ if (pte_write(pte)) return true; if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ if (pte_needs_soft_dirty_wp(vma, pte)) return false; return !userfaultfd_pte_wp(vma, pte); } static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, struct dev_pagemap **pgmap) { struct mm_struct *mm = vma->vm_mm; struct folio *folio; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; int ret; /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) return no_page_table(vma, flags, address); pte = ptep_get(ptep); if (!pte_present(pte)) goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags)) goto no_page; page = vm_normal_page(vma, address, pte); /* * We only care about anon pages in can_follow_write_pte() and don't * have to worry about pte_devmap() because they are never anon. */ if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, page, vma, flags)) { page = NULL; goto out; } if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN * case since they are only valid while holding the pgmap * reference. */ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); if (*pgmap) page = pte_page(pte); else goto no_page; } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); goto out; } if (is_zero_pfn(pte_pfn(pte))) { page = pte_page(pte); } else { ret = follow_pfn_pte(vma, address, ptep, flags); page = ERR_PTR(ret); goto out; } } folio = page_folio(page); if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */ ret = try_grab_folio(folio, 1, flags); if (unlikely(ret)) { page = ERR_PTR(ret); goto out; } /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see * Documentation/core-api/pin_user_pages.rst for details. */ if (flags & FOLL_PIN) { ret = arch_make_folio_accessible(folio); if (ret) { unpin_user_page(page); page = ERR_PTR(ret); goto out; } } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !folio_test_dirty(folio)) folio_mark_dirty(folio); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use * folio_mark_accessed(). */ folio_mark_accessed(folio); } out: pte_unmap_unlock(ptep, ptl); return page; no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return NULL; return no_page_table(vma, flags, address); } static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, struct follow_page_context *ctx) { pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); pmdval = pmdp_get_lockless(pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags, address); if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); spin_unlock(ptl); if (page) return page; return no_page_table(vma, flags, address); } if (likely(!pmd_leaf(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags, address); ptl = pmd_lock(mm, pmd); pmdval = *pmd; if (unlikely(!pmd_present(pmdval))) { spin_unlock(ptl); return no_page_table(vma, flags, address); } if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); } page = follow_huge_pmd(vma, address, pmd, flags, ctx); spin_unlock(ptl); return page; } static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, struct follow_page_context *ctx) { pud_t *pudp, pud; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pudp = pud_offset(p4dp, address); pud = READ_ONCE(*pudp); if (!pud_present(pud)) return no_page_table(vma, flags, address); if (pud_leaf(pud)) { ptl = pud_lock(mm, pudp); page = follow_huge_pud(vma, address, pudp, flags, ctx); spin_unlock(ptl); if (page) return page; return no_page_table(vma, flags, address); } if (unlikely(pud_bad(pud))) return no_page_table(vma, flags, address); return follow_pmd_mask(vma, address, pudp, flags, ctx); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, struct follow_page_context *ctx) { p4d_t *p4dp, p4d; p4dp = p4d_offset(pgdp, address); p4d = READ_ONCE(*p4dp); BUILD_BUG_ON(p4d_leaf(p4d)); if (!p4d_present(p4d) || p4d_bad(p4d)) return no_page_table(vma, flags, address); return follow_pud_mask(vma, address, p4dp, flags, ctx); } /** * follow_page_mask - look up a page descriptor from a user-virtual address * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a * pointer to output page_mask * * @flags can have FOLL_ flags set, defined in <linux/mm.h> * * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches * the device's dev_pagemap metadata to avoid repeating expensive lookups. * * When getting an anonymous page and the caller has to trigger unsharing * of a shared anonymous page first, -EMLINK is returned. The caller should * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only * relevant with FOLL_PIN and !FOLL_WRITE. * * On output, the @ctx->page_mask is set according to the size of the page. * * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct follow_page_context *ctx) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; struct page *page; vma_pgtable_walk_begin(vma); ctx->page_mask = 0; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) page = no_page_table(vma, flags, address); else page = follow_p4d_mask(vma, address, pgd, flags, ctx); vma_pgtable_walk_end(vma); return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pte_t entry; int ret = -EFAULT; /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) return -EFAULT; if (address > TASK_SIZE) pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); if (pgd_none(*pgd)) return -EFAULT; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return -EFAULT; pud = pud_offset(p4d, address); if (pud_none(*pud)) return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; pte = pte_offset_map(pmd, address); if (!pte) return -EFAULT; entry = ptep_get(pte); if (pte_none(entry)) goto unmap; *vma = get_gate_vma(mm); if (!page) goto out; *page = vm_normal_page(*vma, address, entry); if (!*page) { if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry))) goto unmap; *page = pte_page(entry); } ret = try_grab_folio(page_folio(*page), 1, gup_flags); if (unlikely(ret)) goto unmap; out: ret = 0; unmap: pte_unmap(pte); return ret; } /* * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set * to 0 and -EBUSY returned. */ static int faultin_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags, bool unshare, int *locked) { unsigned int fault_flags = 0; vm_fault_t ret; if (flags & FOLL_NOFAULT) return -EFAULT; if (flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; if (flags & FOLL_UNLOCKABLE) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; /* * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. * That's because some callers may not be prepared to * handle early exits caused by non-fatal signals. */ if (flags & FOLL_INTERRUPTIBLE) fault_flags |= FAULT_FLAG_INTERRUPTIBLE; } if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (flags & FOLL_TRIED) { /* * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED * can co-exist */ fault_flags |= FAULT_FLAG_TRIED; } if (unshare) { fault_flags |= FAULT_FLAG_UNSHARE; /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); } ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * With FAULT_FLAG_RETRY_NOWAIT we'll never release the * mmap lock in the page fault handler. Sanity check this. */ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); *locked = 0; /* * We should do the same as VM_FAULT_RETRY, but let's not * return -EBUSY since that's not reflecting the reality of * what has happened - we've just fully completed a page * fault, with the mmap lock released. Use -EAGAIN to show * that we want to take the mmap lock _again_. */ return -EAGAIN; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, flags); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) *locked = 0; return -EBUSY; } return 0; } /* * Writing to file-backed mappings which require folio dirty tracking using GUP * is a fundamentally broken operation, as kernel write access to GUP mappings * do not adhere to the semantics expected by a file system. * * Consider the following scenario:- * * 1. A folio is written to via GUP which write-faults the memory, notifying * the file system and dirtying the folio. * 2. Later, writeback is triggered, resulting in the folio being cleaned and * the PTE being marked read-only. * 3. The GUP caller writes to the folio, as it is mapped read/write via the * direct mapping. * 4. The GUP caller, now done with the page, unpins it and sets it dirty * (though it does not have to). * * This results in both data being written to a folio without writenotify, and * the folio being dirtied unexpectedly (if the caller decides to do so). */ static bool writable_file_mapping_allowed(struct vm_area_struct *vma, unsigned long gup_flags) { /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the case we disallow. */ if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) != (FOLL_PIN | FOLL_LONGTERM)) return true; /* * If the VMA does not require dirty tracking then no problematic write * can occur either. */ return !vma_needs_dirty_tracking(vma); } static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; int write = (gup_flags & FOLL_WRITE); int foreign = (gup_flags & FOLL_REMOTE); bool vma_anon = vma_is_anonymous(vma); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; if ((gup_flags & FOLL_ANON) && !vma_anon) return -EFAULT; if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma)) return -EOPNOTSUPP; if (vma_is_secretmem(vma)) return -EFAULT; if (write) { if (!vma_anon && !writable_file_mapping_allowed(vma, gup_flags)) return -EFAULT; if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could * set a breakpoint in a read-only mapping of an * executable, without corrupting the file (yet only * when that file had been opened for writing!). * Anon pages in shared mappings are surprising: now * just reject it. */ if (!is_cow_mapping(vm_flags)) return -EFAULT; } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; /* * Is there actually any vma we can reach here which does not * have VM_MAYREAD set? */ if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } /* * gups are always data accesses, not instruction * fetches, so execute=false here */ if (!arch_vma_access_permitted(vma, write, false, foreign)) return -EFAULT; return 0; } /* * This is "vma_lookup()", but with a warning if we would have * historically expanded the stack in the GUP code. */ static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm, unsigned long addr) { #ifdef CONFIG_STACK_GROWSUP return vma_lookup(mm, addr); #else static volatile unsigned long next_warn; struct vm_area_struct *vma; unsigned long now, next; vma = find_vma(mm, addr); if (!vma || (addr >= vma->vm_start)) return vma; /* Only warn for half-way relevant accesses */ if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; if (vma->vm_start - addr > 65536) return NULL; /* Let's not warn more than once an hour.. */ now = jiffies; next = next_warn; if (next && time_before(now, next)) return NULL; next_warn = now + 60*60*HZ; /* Let people know things may have changed. */ pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n", current->comm, task_pid_nr(current), vma->vm_start, vma->vm_end, addr); dump_stack(); return NULL; #endif } /** * __get_user_pages() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: whether we're still with the mmap_lock held * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * -- 0 return value is possible when the fault would need to be retried. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * __get_user_pages returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If * the page is written to, set_page_dirty (or set_page_dirty_lock, as * appropriate) must be called after the page is finished with, and * before put_page is called. * * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may * be released. If this happens *@locked will be set to 0 on return. * * A caller using such a combination of @gup_flags must therefore hold the * mmap_lock for reading only, and recognize when it's been released. Otherwise, * it must be held for either reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if * you need some special @gup_flags. */ static long __get_user_pages(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; struct follow_page_context ctx = { NULL }; if (!nr_pages) return 0; start = untagged_addr_remote(mm, start); VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); do { struct page *page; unsigned int page_increm; /* first iteration or cross vma bound */ if (!vma || start >= vma->vm_end) { /* * MADV_POPULATE_(READ|WRITE) wants to handle VMA * lookups+error reporting differently. */ if (gup_flags & FOLL_MADV_POPULATE) { vma = vma_lookup(mm, start); if (!vma) { ret = -ENOMEM; goto out; } if (check_vma_flags(vma, gup_flags)) { ret = -EINVAL; goto out; } goto retry; } vma = gup_vma_lookup(mm, start); if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, pages ? &page : NULL); if (ret) goto out; ctx.page_mask = 0; goto next_page; } if (!vma) { ret = -EFAULT; goto out; } ret = check_vma_flags(vma, gup_flags); if (ret) goto out; } retry: /* * If we have a pending SIGKILL, don't keep faulting pages and * potentially allocating memory. */ if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); page = follow_page_mask(vma, start, gup_flags, &ctx); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, gup_flags, PTR_ERR(page) == -EMLINK, locked); switch (ret) { case 0: goto retry; case -EBUSY: case -EAGAIN: ret = 0; fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: goto out; } BUG(); } else if (PTR_ERR(page) == -EEXIST) { /* * Proper page table entry exists, but no corresponding * struct page. If the caller expects **pages to be * filled in, bail out now, because that can't be done * for this page. */ if (pages) { ret = PTR_ERR(page); goto out; } } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } next_page: page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; if (pages) { struct page *subpage; unsigned int j; /* * This must be a large folio (and doesn't need to * be the whole folio; it can be part of it), do * the refcount work for all the subpages too. * * NOTE: here the page may not be the head page * e.g. when start addr is not thp-size aligned. * try_grab_folio() should have taken care of tail * pages. */ if (page_increm > 1) { struct folio *folio = page_folio(page); /* * Since we already hold refcount on the * large folio, this should never fail. */ if (try_grab_folio(folio, page_increm - 1, gup_flags)) { /* * Release the 1st page ref if the * folio is problematic, fail hard. */ gup_put_folio(folio, 1, gup_flags); ret = -EFAULT; goto out; } } for (j = 0; j < page_increm; j++) { subpage = nth_page(page, j); pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); } } i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; } while (nr_pages); out: if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return i ? i : ret; } static bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; if (!(vm_flags & vma->vm_flags)) return false; /* * The architecture might have a hardware protection * mechanism other than read/write that can deny access. * * gup always represents data access, not instruction * fetches, so execute=false here: */ if (!arch_vma_access_permitted(vma, write, false, foreign)) return false; return true; } /** * fixup_user_fault() - manually resolve a user page fault * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller * does not allow retry. If NULL, the caller must guarantee * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() * section), this returns -EFAULT, and we want to resolve the user fault before * trying again. * * Typically this is meant to be used by the futex code. * * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * * This function will not return with an unlocked mmap_lock. So it has not the * same semantics wrt the @mm->mmap_lock as does filemap_fault(). */ int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { struct vm_area_struct *vma; vm_fault_t ret; address = untagged_addr_remote(mm, address); if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; retry: vma = gup_vma_lookup(mm, address); if (!vma) return -EFAULT; if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; if ((fault_flags & FAULT_FLAG_KILLABLE) && fatal_signal_pending(current)) return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); if (ret & VM_FAULT_COMPLETED) { /* * NOTE: it's a pity that we need to retake the lock here * to pair with the unlock() in the callers. Ideally we * could tell the callers so they do not need to unlock. */ mmap_read_lock(mm); *unlocked = true; return 0; } if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); if (err) return err; BUG(); } if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); *unlocked = true; fault_flags |= FAULT_FLAG_TRIED; goto retry; } return 0; } EXPORT_SYMBOL_GPL(fixup_user_fault); /* * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is * specified, it'll also respond to generic signals. The caller of GUP * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. */ static bool gup_signal_pending(unsigned int flags) { if (fatal_signal_pending(current)) return true; if (!(flags & FOLL_INTERRUPTIBLE)) return false; return signal_pending(current); } /* * Locking: (*locked == 1) means that the mmap_lock has already been acquired by * the caller. This function may drop the mmap_lock. If it does so, then it will * set (*locked = 0). * * (*locked == 0) means that the caller expects this function to acquire and * drop the mmap_lock. Therefore, the value of *locked will still be zero when * the function returns, even though it may have changed temporarily during * function execution. * * Please note that this function, unlike __get_user_pages(), will not return 0 * for nr_pages > 0, unless FOLL_NOWAIT is used. */ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int flags) { long ret, pages_done; bool must_unlock = false; if (!nr_pages) return 0; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } else mmap_assert_locked(mm); if (flags & FOLL_PIN) mm_set_has_pinned_flag(&mm->flags); /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has * carelessly failed to specify FOLL_GET), so keep doing that, but only * for FOLL_GET, not for the newer FOLL_PIN. * * FOLL_PIN always expects pages to be non-null, but no need to assert * that here, as any failures will be obvious enough. */ if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; for (;;) { ret = __get_user_pages(mm, start, nr_pages, flags, pages, locked); if (!(flags & FOLL_UNLOCKABLE)) { /* VM_FAULT_RETRY couldn't trigger, bypass */ pages_done = ret; break; } /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ if (!*locked) { BUG_ON(ret < 0); BUG_ON(ret >= nr_pages); } if (ret > 0) { nr_pages -= ret; pages_done += ret; if (!nr_pages) break; } if (*locked) { /* * VM_FAULT_RETRY didn't trigger or it was a * FOLL_NOWAIT. */ if (!pages_done) pages_done = ret; break; } /* * VM_FAULT_RETRY triggered, so seek to the faulting offset. * For the prefault case (!pages) we only update counts. */ if (likely(pages)) pages += ret; start += ret << PAGE_SHIFT; /* The lock was temporarily dropped, so we must unlock later */ must_unlock = true; retry: /* * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted * by fatal signals of even common signals, depending on * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; } ret = mmap_read_lock_killable(mm); if (ret) { BUG_ON(ret > 0); if (!pages_done) pages_done = ret; break; } *locked = 1; ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, pages, locked); if (!*locked) { /* Continue to retry until we succeeded */ BUG_ON(ret != 0); goto retry; } if (ret != 1) { BUG_ON(ret > 1); if (!pages_done) pages_done = ret; break; } nr_pages--; pages_done++; if (!nr_pages) break; if (likely(pages)) pages++; start += PAGE_SIZE; } if (must_unlock && *locked) { /* * We either temporarily dropped the lock, or the caller * requested that we both acquire and drop the lock. Either way, * we must now unlock, and notify the caller of that state. */ mmap_read_unlock(mm); *locked = 0; } /* * Failing to pin anything implies something has gone wrong (except when * FOLL_NOWAIT is specified). */ if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT))) return -EFAULT; return pages_done; } /** * populate_vma_page_range() - populate a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address * @locked: whether the mmap_lock is still held * * This takes care of mlocking the pages too if VM_LOCKED is set. * * Return either number of pages pinned in the vma, or a negative error * code on error. * * vma->vm_mm->mmap_lock must be held. * * If @locked is NULL, it may be held for read or write and will * be unperturbed. * * If @locked is non-NULL, it must held for read only and may be * released. If it's released, *@locked will be set to 0. */ long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked) { struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int local_locked = 1; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); VM_BUG_ON_VMA(start < vma->vm_start, vma); VM_BUG_ON_VMA(end > vma->vm_end, vma); mmap_assert_locked(mm); /* * Rightly or wrongly, the VM_LOCKONFAULT case has never used * faultin_page() to break COW, so it has no work to do here. */ if (vma->vm_flags & VM_LOCKONFAULT) return nr_pages; /* ... similarly, we've never faulted in PROT_NONE pages */ if (!vma_is_accessible(vma)) return -EFAULT; gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW * and we would not want to dirty them for nothing. * * Otherwise, do a read fault, and use FOLL_FORCE in case it's not * readable (ie write-only or executable). */ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; else gup_flags |= FOLL_FORCE; if (locked) gup_flags |= FOLL_UNLOCKABLE; /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ ret = __get_user_pages(mm, start, nr_pages, gup_flags, NULL, locked ? locked : &local_locked); lru_add_drain(); return ret; } /* * faultin_page_range() - populate (prefault) page tables inside the * given range readable/writable * * This takes care of mlocking the pages, too, if VM_LOCKED is set. * * @mm: the mm to populate page tables in * @start: start address * @end: end address * @write: whether to prefault readable or writable * @locked: whether the mmap_lock is still held * * Returns either number of processed pages in the MM, or a negative error * code on error (see __get_user_pages()). Note that this function reports * errors related to VMAs, such as incompatible mappings, as expected by * MADV_POPULATE_(READ|WRITE). * * The range must be page-aligned. * * mm->mmap_lock must be held. If it's released, *@locked will be set to 0. */ long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked) { unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; long ret; VM_BUG_ON(!PAGE_ALIGNED(start)); VM_BUG_ON(!PAGE_ALIGNED(end)); mmap_assert_locked(mm); /* * FOLL_TOUCH: Mark page accessed and thereby young; will also mark * the page dirty with FOLL_WRITE -- which doesn't make a * difference with !FOLL_FORCE, because the page is writable * in the page table. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit * a poisoned page. * !FOLL_FORCE: Require proper access permissions. */ gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE | FOLL_MADV_POPULATE; if (write) gup_flags |= FOLL_WRITE; ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked, gup_flags); lru_add_drain(); return ret; } /* * __mm_populate - populate and/or mlock pages within a range of address space. * * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap * flags. VMAs must be already marked with the desired vm_flags, and * mmap_lock must not be held. */ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) { struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; long ret = 0; end = start + len; for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ if (!locked) { locked = 1; mmap_read_lock(mm); vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) vma = find_vma_intersection(mm, vma->vm_end, end); if (!vma) break; /* * Set [nstart; nend) to intersection of desired address * range with the first VMA. Also, skip undesirable VMA types. */ nend = min(end, vma->vm_end); if (vma->vm_flags & (VM_IO | VM_PFNMAP)) continue; if (nstart < vma->vm_start) nstart = vma->vm_start; /* * Now fault in a range of pages. populate_vma_page_range() * double checks the vma flags, so that it won't mlock pages * if the vma was already munlocked. */ ret = populate_vma_page_range(vma, nstart, nend, &locked); if (ret < 0) { if (ignore_errors) { ret = 0; continue; /* continue at next VMA */ } break; } nend = nstart + ret * PAGE_SIZE; ret = 0; } if (locked) mmap_read_unlock(mm); return ret; /* 0 or negative error code */ } #else /* CONFIG_MMU */ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int foll_flags) { struct vm_area_struct *vma; bool must_unlock = false; unsigned long vm_flags; long i; if (!nr_pages) return 0; /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. */ if (!*locked) { if (mmap_read_lock_killable(mm)) return -EAGAIN; must_unlock = true; *locked = 1; } /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. */ vm_flags = (foll_flags & FOLL_WRITE) ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= (foll_flags & FOLL_FORCE) ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) break; /* protect what we can, including chardevs */ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || !(vm_flags & vma->vm_flags)) break; if (pages) { pages[i] = virt_to_page((void *)start); if (pages[i]) get_page(pages[i]); } start = (start + PAGE_SIZE) & PAGE_MASK; } if (must_unlock && *locked) { mmap_read_unlock(mm); *locked = 0; } return i ? : -EFAULT; } #endif /* !CONFIG_MMU */ /** * fault_in_writeable - fault in userspace address range for writing * @uaddr: start of address range * @size: size of address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_writeable(char __user *uaddr, size_t size) { char __user *start = uaddr, *end; if (unlikely(size == 0)) return 0; if (!user_write_access_begin(uaddr, size)) return size; if (!PAGE_ALIGNED(uaddr)) { unsafe_put_user(0, uaddr, out); uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { unsafe_put_user(0, uaddr, out); uaddr += PAGE_SIZE; } out: user_write_access_end(); if (size > uaddr - start) return size - (uaddr - start); return 0; } EXPORT_SYMBOL(fault_in_writeable); /** * fault_in_subpage_writeable - fault in an address range for writing * @uaddr: start of address range * @size: size of address range * * Fault in a user address range for writing while checking for permissions at * sub-page granularity (e.g. arm64 MTE). This function should be used when * the caller cannot guarantee forward progress of a copy_to_user() loop. * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_subpage_writeable(char __user *uaddr, size_t size) { size_t faulted_in; /* * Attempt faulting in at page granularity first for page table * permission checking. The arch-specific probe_subpage_writeable() * functions may not check for this. */ faulted_in = size - fault_in_writeable(uaddr, size); if (faulted_in) faulted_in -= probe_subpage_writeable(uaddr, faulted_in); return size - faulted_in; } EXPORT_SYMBOL(fault_in_subpage_writeable); /* * fault_in_safe_writeable - fault in an address range for writing * @uaddr: start of address range * @size: length of address range * * Faults in an address range for writing. This is primarily useful when we * already know that some or all of the pages in the address range aren't in * memory. * * Unlike fault_in_writeable(), this function is non-destructive. * * Note that we don't pin or otherwise hold the pages referenced that we fault * in. There's no guarantee that they'll stay in memory for any duration of * time. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). */ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) { unsigned long start = (unsigned long)uaddr, end; struct mm_struct *mm = current->mm; bool unlocked = false; if (unlikely(size == 0)) return 0; end = PAGE_ALIGN(start + size); if (end < start) end = 0; mmap_read_lock(mm); do { if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) break; start = (start + PAGE_SIZE) & PAGE_MASK; } while (start != end); mmap_read_unlock(mm); if (size > start - (unsigned long)uaddr) return size - (start - (unsigned long)uaddr); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); /** * fault_in_readable - fault in userspace address range for reading * @uaddr: start of user address range * @size: size of user address range * * Returns the number of bytes not faulted in (like copy_to_user() and * copy_from_user()). */ size_t fault_in_readable(const char __user *uaddr, size_t size) { const char __user *start = uaddr, *end; volatile char c; if (unlikely(size == 0)) return 0; if (!user_read_access_begin(uaddr, size)) return size; if (!PAGE_ALIGNED(uaddr)) { unsafe_get_user(c, uaddr, out); uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { unsafe_get_user(c, uaddr, out); uaddr += PAGE_SIZE; } out: user_read_access_end(); (void)c; if (size > uaddr - start) return size - (uaddr - start); return 0; } EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). * * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - * allowing a hole to be left in the corefile to save disk space. * * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; int ret; ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION /* * An array of either pages or folios ("pofs"). Although it may seem tempting to * avoid this complication, by simply interpreting a list of folios as a list of * pages, that approach won't work in the longer term, because eventually the * layouts of struct page and struct folio will become completely different. * Furthermore, this pof approach avoids excessive page_folio() calls. */ struct pages_or_folios { union { struct page **pages; struct folio **folios; void **entries; }; bool has_folios; long nr_entries; }; static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i) { if (pofs->has_folios) return pofs->folios[i]; return page_folio(pofs->pages[i]); } static void pofs_clear_entry(struct pages_or_folios *pofs, long i) { pofs->entries[i] = NULL; } static void pofs_unpin(struct pages_or_folios *pofs) { if (pofs->has_folios) unpin_folios(pofs->folios, pofs->nr_entries); else unpin_user_pages(pofs->pages, pofs->nr_entries); } /* * Returns the number of collected folios. Return value is always >= 0. */ static void collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, struct pages_or_folios *pofs) { struct folio *prev_folio = NULL; bool drain_allow = true; unsigned long i; for (i = 0; i < pofs->nr_entries; i++) { struct folio *folio = pofs_get_folio(pofs, i); if (folio == prev_folio) continue; prev_folio = folio; if (folio_is_longterm_pinnable(folio)) continue; if (folio_is_device_coherent(folio)) continue; if (folio_test_hugetlb(folio)) { folio_isolate_hugetlb(folio, movable_folio_list); continue; } if (!folio_test_lru(folio) && drain_allow) { lru_add_drain_all(); drain_allow = false; } if (!folio_isolate_lru(folio)) continue; list_add_tail(&folio->lru, movable_folio_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } } /* * Unpins all folios and migrates device coherent folios and movable_folio_list. * Returns -EAGAIN if all folios were successfully migrated or -errno for * failure (or partial success). */ static int migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list, struct pages_or_folios *pofs) { int ret; unsigned long i; for (i = 0; i < pofs->nr_entries; i++) { struct folio *folio = pofs_get_folio(pofs, i); if (folio_is_device_coherent(folio)) { /* * Migration will fail if the folio is pinned, so * convert the pin on the source folio to a normal * reference. */ pofs_clear_entry(pofs, i); folio_get(folio); gup_put_folio(folio, 1, FOLL_PIN); if (migrate_device_coherent_folio(folio)) { ret = -EBUSY; goto err; } continue; } /* * We can't migrate folios with unexpected references, so drop * the reference obtained by __get_user_pages_locked(). * Migrating folios have been added to movable_folio_list after * calling folio_isolate_lru() which takes a reference so the * folio won't be freed if it's migrating. */ unpin_folio(folio); pofs_clear_entry(pofs, i); } if (!list_empty(movable_folio_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, .reason = MR_LONGTERM_PIN, }; if (migrate_pages(movable_folio_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; goto err; } } putback_movable_pages(movable_folio_list); return -EAGAIN; err: pofs_unpin(pofs); putback_movable_pages(movable_folio_list); return ret; } static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) { LIST_HEAD(movable_folio_list); collect_longterm_unpinnable_folios(&movable_folio_list, pofs); if (list_empty(&movable_folio_list)) return 0; return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); } /* * Check whether all folios are *allowed* to be pinned indefinitely (long term). * Rather confusingly, all folios in the range are required to be pinned via * FOLL_PIN, before calling this routine. * * Return values: * * 0: if everything is OK and all folios in the range are allowed to be pinned, * then this routine leaves all folios pinned and returns zero for success. * * -EAGAIN: if any folios in the range are not allowed to be pinned, then this * routine will migrate those folios away, unpin all the folios in the range. If * migration of the entire set of folios succeeds, then -EAGAIN is returned. The * caller should re-pin the entire range with FOLL_PIN and then call this * routine again. * * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this * indicates a migration failure. The caller should give up, and propagate the * error back up the call stack. The caller does not need to unpin any folios in * that case, because this routine will do the unpinning. */ static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) { struct pages_or_folios pofs = { .folios = folios, .has_folios = true, .nr_entries = nr_folios, }; return check_and_migrate_movable_pages_or_folios(&pofs); } /* * Return values and behavior are the same as those for * check_and_migrate_movable_folios(). */ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { struct pages_or_folios pofs = { .pages = pages, .has_folios = false, .nr_entries = nr_pages, }; return check_and_migrate_movable_pages_or_folios(&pofs); } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) { return 0; } static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) { return 0; } #endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which * allows us to process the FOLL_LONGTERM flag. */ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, struct page **pages, int *locked, unsigned int gup_flags) { unsigned int flags; long rc, nr_pinned_pages; if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, pages, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; break; } /* FOLL_LONGTERM implies FOLL_PIN */ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); return rc ? rc : nr_pinned_pages; } /* * Check that the given flags are valid for the exported gup/pup interface, and * update them with the required flags that the caller must have set. */ static bool is_valid_gup_args(struct page **pages, int *locked, unsigned int *gup_flags_p, unsigned int to_set) { unsigned int gup_flags = *gup_flags_p; /* * These flags not allowed to be specified externally to the gup * interfaces: * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote() * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) return false; gup_flags |= to_set; if (locked) { /* At the external interface locked must be set */ if (WARN_ON_ONCE(*locked != 1)) return false; gup_flags |= FOLL_UNLOCKABLE; } /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return false; /* LONGTERM can only be specified when pinning */ if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM))) return false; /* Pages input must be given if using GET/PIN */ if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages)) return false; /* We want to allow the pgmap to be hot-unplugged at all times */ if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))) return false; *gup_flags_p = gup_flags; return true; } #ifdef CONFIG_MMU /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Returns either number of pages pinned (which may be less than the * number requested), or an error. Details about the return value: * * -- If nr_pages is 0, returns 0. * -- If nr_pages is >0, but no pages were pinned, returns -errno. * -- If nr_pages is >0, and some pages were pinned, returns the number of * pages pinned. Again, this may be less than nr_pages. * * The caller is responsible for releasing returned @pages, via put_page(). * * Must be called with mmap_lock held for read or write. * * get_user_pages_remote walks a process's page tables and takes a reference * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re-faulted). However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page * contains data that was valid *at some point in time*. Typically, an IO * or similar operation cannot guarantee anything stronger anyway because * locks can't be held over the syscall boundary. * * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * * get_user_pages_remote is typically used for fewer-copy IO operations, * to get a handle on the memory by some means other than accesses * via the user virtual addresses. The pages may be submitted for * DMA to devices or accessed via their kernel linear mapping (via the * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_TOUCH | FOLL_REMOTE)) return -EINVAL; return __get_user_pages_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_remote); #else /* CONFIG_MMU */ long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { return 0; } #endif /* !CONFIG_MMU */ /** * get_user_pages() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. * * This is the same as get_user_pages_remote(), just with a less-flexible * calling convention where we assume that the mm being operated on belongs to * the current task, and doesn't allow passing of a locked parameter. We also * obviously don't pass FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages); /* * get_user_pages_unlocked() is suitable to replace the form: * * mmap_read_lock(mm); * get_user_pages(mm, ..., pages, NULL); * mmap_read_unlock(mm); * * with: * * get_user_pages_unlocked(mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so * get_user_pages_fast should be used instead if specific gup_flags * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH | FOLL_UNLOCKABLE)) return -EINVAL; return __get_user_pages_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(get_user_pages_unlocked); /* * GUP-fast * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be * protected from page table pages being freed from under it, and should * block any THP splits. * * One way to achieve this is to have the walker disable interrupts, and * rely on IPIs from the TLB flushing code blocking before the page table * pages are freed. This is unsuitable for architectures that do not need * to broadcast an IPI when invalidating TLBs. * * Another way to achieve this is to batch up page table containing pages * belonging to more than one mm_user, then rcu_sched a callback to free those * pages. Disabling interrupts will allow the gup_fast() walker to both block * the rcu_sched callback, and an IPI that we broadcast for splitting THPs * (which is a relatively rare event). The code below adopts this strategy. * * Before activating this code, please be aware that the following assumptions * are currently made: * * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * * *) valid user addesses are below TASK_MAX_SIZE * * The last two assumptions can be relaxed by the addition of helper functions. * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ #ifdef CONFIG_HAVE_GUP_FAST /* * Used in the GUP-fast path to determine whether GUP is permitted to work on * a specific folio. * * This call assumes the caller has pinned the folio, that the lowest page table * level still points to this folio, and that interrupts have been disabled. * * GUP-fast must reject all secretmem folios. * * Writing to pinned file-backed dirty tracked folios is inherently problematic * (see comment describing the writable_file_mapping_allowed() function). We * therefore try to avoid the most egregious case of a long-term mapping doing * so. * * This function cannot be as thorough as that one as the VMA is not available * in the fast path, so instead we whitelist known good cases and if in doubt, * fall back to the slow path. */ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) { bool reject_file_backed = false; struct address_space *mapping; bool check_secretmem = false; unsigned long mapping_flags; /* * If we aren't pinning then no problematic write can occur. A long term * pin is the most egregious case so this is the one we disallow. */ if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) == (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) reject_file_backed = true; /* We hold a folio reference, so we can safely access folio fields. */ /* secretmem folios are always order-0 folios. */ if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio)) check_secretmem = true; if (!reject_file_backed && !check_secretmem) return true; if (WARN_ON_ONCE(folio_test_slab(folio))) return false; /* hugetlb neither requires dirty-tracking nor can be secretmem. */ if (folio_test_hugetlb(folio)) return true; /* * GUP-fast disables IRQs. When IRQS are disabled, RCU grace periods * cannot proceed, which means no actions performed under RCU can * proceed either. * * inodes and thus their mappings are freed under RCU, which means the * mapping cannot be freed beneath us and thus we can safely dereference * it. */ lockdep_assert_irqs_disabled(); /* * However, there may be operations which _alter_ the mapping, so ensure * we read it once and only once. */ mapping = READ_ONCE(folio->mapping); /* * The mapping may have been truncated, in any case we cannot determine * if this mapping is safe - fall back to slow path to determine how to * proceed. */ if (!mapping) return false; /* Anonymous folios pose no problem. */ mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; if (mapping_flags) return mapping_flags & PAGE_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an * address_space object. */ if (check_secretmem && secretmem_mapping(mapping)) return false; /* The only remaining allowed file system is shmem. */ return !reject_file_backed || shmem_mapping(mapping); } static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) { while ((*nr) - nr_start) { struct folio *folio = page_folio(pages[--(*nr)]); folio_clear_referenced(folio); gup_put_folio(folio, 1, flags); } } #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL /* * GUP-fast relies on pte change detection to avoid concurrent pgtable * operations. * * To pin the page, GUP-fast needs to do below in order: * (1) pin the page (by prefetching pte), then (2) check pte not changed. * * For the rest of pgtable operations where pgtable updates can be racy * with GUP-fast, we need to do (1) clear pte, then (2) check whether page * is pinned. * * Above will work for all pte-level operations, including THP split. * * For THP collapse, it's a bit more complicated because GUP-fast may be * walking a pgtable page that is being freed (pte is still valid but pmd * can be cleared already). To avoid race in such condition, we need to * also check pmd here to make sure pmd doesn't change (corresponds to * pmdp_collapse_flush() in the THP collapse code path). */ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; ptem = ptep = pte_offset_map(&pmd, addr); if (!ptep) return 0; do { pte_t pte = ptep_get_lockless(ptep); struct page *page; struct folio *folio; /* * Always fallback to ordinary GUP on PROT_NONE-mapped pages: * pte_access_permitted() better should reject these pages * either way: otherwise, GUP-fast might succeed in * cases where ordinary GUP would fail due to VMA access * permissions. */ if (pte_protnone(pte)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { if (unlikely(flags & FOLL_LONGTERM)) goto pte_unmap; pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); goto pte_unmap; } } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); folio = try_grab_folio_fast(page, 1, flags); if (!folio) goto pte_unmap; if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } /* * We need to make the page accessible if and only if we are * going to access its content (the FOLL_PIN case). Please * see Documentation/core-api/pin_user_pages.rst for * details. */ if (flags & FOLL_PIN) { ret = arch_make_folio_accessible(folio); if (ret) { gup_put_folio(folio, 1, flags); goto pte_unmap; } } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; } while (ptep++, addr += PAGE_SIZE, addr != end); ret = 1; pte_unmap: if (pgmap) put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } #else /* * If we can't determine whether or not a pte is special, then fail immediately * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_fast_pmd_leaf even if we can't operate on ptes. */ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { return 0; } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; do { struct folio *folio; struct page *page = pfn_to_page(pfn); pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; } folio = try_grab_folio_fast(page, 1, flags); if (!folio) { gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); break; } folio_set_referenced(folio); pages[*nr] = page; (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); return addr == end; } static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long fault_pfn; int nr_start = *nr; fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); if (!gup_fast_devmap_leaf(fault_pfn, addr, end, flags, pages, nr)) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages); return 0; } return 1; } #else static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { BUILD_BUG(); return 0; } #endif static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pmd_special(orig)) return 0; if (pmd_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags, pages, nr); } page = pmd_page(orig); refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *page; struct folio *folio; int refs; if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pud_special(orig)) return 0; if (pud_devmap(orig)) { if (unlikely(flags & FOLL_LONGTERM)) return 0; return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags, pages, nr); } page = pud_page(orig); refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; if (unlikely(pud_val(orig) != pud_val(*pudp))) { gup_put_folio(folio, refs, flags); return 0; } if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *page; struct folio *folio; if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); page = pgd_page(orig); refs = record_subpages(page, PGDIR_SIZE, addr, end, pages + *nr); folio = try_grab_folio_fast(page, refs, flags); if (!folio) return 0; if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { gup_put_folio(folio, refs, flags); return 0; } if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } if (!gup_fast_folio_allowed(folio, flags)) { gup_put_folio(folio, refs, flags); return 0; } *nr += refs; folio_set_referenced(folio); return 1; } static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = pmdp_get_lockless(pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) return 0; if (unlikely(pmd_leaf(pmd))) { /* See gup_fast_pte_range() */ if (pmd_protnone(pmd)) return 0; if (!gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } else if (!gup_fast_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); return 1; } static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; if (unlikely(pud_leaf(pud))) { if (!gup_fast_pud_leaf(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (!gup_fast_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); next = p4d_addr_end(addr, end); if (!p4d_present(p4d)) return 0; BUILD_BUG_ON(p4d_leaf(p4d)); if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); return 1; } static void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; pgdp = pgd_offset(current->mm, addr); do { pgd_t pgd = READ_ONCE(*pgdp); next = pgd_addr_end(addr, end); if (pgd_none(pgd)) return; if (unlikely(pgd_leaf(pgd))) { if (!gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (!gup_fast_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } #else static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { } #endif /* CONFIG_HAVE_GUP_FAST */ #ifndef gup_fast_permitted /* * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) { return true; } #endif static unsigned long gup_fast(unsigned long start, unsigned long end, unsigned int gup_flags, struct page **pages) { unsigned long flags; int nr_pinned = 0; unsigned seq; if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) || !gup_fast_permitted(start, end)) return 0; if (gup_flags & FOLL_PIN) { if (!raw_seqcount_try_begin(&current->mm->write_protect_seq, seq)) return 0; } /* * Disable interrupts. The nested form is used, in order to allow full, * general purpose use of this routine. * * With interrupts disabled, we block page table pages from being freed * from under us. See struct mmu_table_batch comments in * include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock() here as we also want to block IPIs * that come from THPs splitting. */ local_irq_save(flags); gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); /* * When pinning pages for DMA there could be a concurrent write protect * from fork() via copy_page_range(), in this case always fail GUP-fast. */ if (gup_flags & FOLL_PIN) { if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) { gup_fast_unpin_user_pages(pages, nr_pinned); return 0; } else { sanity_check_pinned_pages(pages, nr_pinned); } } return nr_pinned; } static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { unsigned long len, end; unsigned long nr_pinned; int locked = 0; int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | FOLL_FAST_ONLY | FOLL_NOFAULT | FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT))) return -EINVAL; if (gup_flags & FOLL_PIN) mm_set_has_pinned_flag(&current->mm->flags); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; len = nr_pages << PAGE_SHIFT; if (check_add_overflow(start, len, &end)) return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; nr_pinned = gup_fast(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) return nr_pinned; /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, pages, &locked, gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so * returning -errno is not an option */ if (nr_pinned) return nr_pinned; return ret; } return ret + nr_pinned; } /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * * If the architecture does not support this function, simply return with no * pages pinned. * * Careful, careful! COW breaking can go either way, so a non-write * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, * because gup fast is always a "pin with a +1 page refcount" request. * * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET | FOLL_FAST_ONLY)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Attempt to pin user pages in memory without taking mm->mmap_lock. * If not successful, it will fall back to taking the lock and * calling get_user_pages(). * * Returns number of pages pinned. This may be fewer than the number requested. * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns * -errno. */ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { /* * The caller may or may not have explicitly set FOLL_GET; either way is * OK. However, internally (within mm/gup.c), gup fast variants must set * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" * request. */ if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); /** * pin_user_pages_fast() - pin user pages in memory without taking locks * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See * get_user_pages_fast() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return -EINVAL; return gup_fast_fallback(start, nr_pages, gup_flags, pages); } EXPORT_SYMBOL_GPL(pin_user_pages_fast); /** * pin_user_pages_remote() - pin pages of a remote process * * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @locked: pointer to lock flag indicating whether lock is held and * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See * get_user_pages_remote() for documentation on the function arguments, because * the arguments here are identical. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { int local_locked = 1; if (!is_valid_gup_args(pages, locked, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) return 0; return __gup_longterm_locked(mm, start, nr_pages, pages, locked ? locked : &local_locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_remote); /** * pin_user_pages() - pin user pages in memory for use by other devices * * @start: starting user address * @nr_pages: number of pages from start to pin * @gup_flags: flags modifying lookup behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and * FOLL_PIN is set. * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { int locked = 1; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages); /* * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. * * Note that if a zero_page is amongst the returned pages, it will not have * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) { int locked = 0; if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) return 0; return __gup_longterm_locked(current->mm, start, nr_pages, pages, &locked, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); /** * memfd_pin_folios() - pin folios associated with a memfd * @memfd: the memfd whose folios are to be pinned * @start: the first memfd offset * @end: the last memfd offset (inclusive) * @folios: array that receives pointers to the folios pinned * @max_folios: maximum number of entries in @folios * @offset: the offset into the first folio * * Attempt to pin folios associated with a memfd in the contiguous range * [start, end]. Given that a memfd is either backed by shmem or hugetlb, * the folios can either be found in the page cache or need to be allocated * if necessary. Once the folios are located, they are all pinned via * FOLL_PIN and @offset is populatedwith the offset into the first folio. * And, eventually, these pinned folios must be released either using * unpin_folios() or unpin_folio(). * * It must be noted that the folios may be pinned for an indefinite amount * of time. And, in most cases, the duration of time they may stay pinned * would be controlled by the userspace. This behavior is effectively the * same as using FOLL_LONGTERM with other GUP APIs. * * Returns number of folios pinned, which could be less than @max_folios * as it depends on the folio sizes that cover the range [start, end]. * If no folios were pinned, it returns -errno. */ long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset) { unsigned int flags, nr_folios, nr_found; unsigned int i, pgshift = PAGE_SHIFT; pgoff_t start_idx, end_idx, next_idx; struct folio *folio = NULL; struct folio_batch fbatch; struct hstate *h; long ret = -EINVAL; if (start < 0 || start > end || !max_folios) return -EINVAL; if (!memfd) return -EINVAL; if (!shmem_file(memfd) && !is_file_hugepages(memfd)) return -EINVAL; if (end >= i_size_read(file_inode(memfd))) return -EINVAL; if (is_file_hugepages(memfd)) { h = hstate_file(memfd); pgshift = huge_page_shift(h); } flags = memalloc_pin_save(); do { nr_folios = 0; start_idx = start >> pgshift; end_idx = end >> pgshift; if (is_file_hugepages(memfd)) { start_idx <<= huge_page_order(h); end_idx <<= huge_page_order(h); } folio_batch_init(&fbatch); while (start_idx <= end_idx && nr_folios < max_folios) { /* * In most cases, we should be able to find the folios * in the page cache. If we cannot find them for some * reason, we try to allocate them and add them to the * page cache. */ nr_found = filemap_get_folios_contig(memfd->f_mapping, &start_idx, end_idx, &fbatch); if (folio) { folio_put(folio); folio = NULL; } next_idx = 0; for (i = 0; i < nr_found; i++) { /* * As there can be multiple entries for a * given folio in the batch returned by * filemap_get_folios_contig(), the below * check is to ensure that we pin and return a * unique set of folios between start and end. */ if (next_idx && next_idx != folio_index(fbatch.folios[i])) continue; folio = page_folio(&fbatch.folios[i]->page); if (try_grab_folio(folio, 1, FOLL_PIN)) { folio_batch_release(&fbatch); ret = -EINVAL; goto err; } if (nr_folios == 0) *offset = offset_in_folio(folio, start); folios[nr_folios] = folio; next_idx = folio_next_index(folio); if (++nr_folios == max_folios) break; } folio = NULL; folio_batch_release(&fbatch); if (!nr_found) { folio = memfd_alloc_folio(memfd, start_idx); if (IS_ERR(folio)) { ret = PTR_ERR(folio); if (ret != -EEXIST) goto err; folio = NULL; } } } ret = check_and_migrate_movable_folios(nr_folios, folios); } while (ret == -EAGAIN); memalloc_pin_restore(flags); return ret ? ret : nr_folios; err: memalloc_pin_restore(flags); unpin_folios(folios, nr_folios); return ret; } EXPORT_SYMBOL_GPL(memfd_pin_folios); /** * folio_add_pins() - add pins to an already-pinned folio * @folio: the folio to add more pins to * @pins: number of pins to add * * Try to add more pins to an already-pinned folio. The semantics * of the pin (e.g., FOLL_WRITE) follow any existing pin and cannot * be changed. * * This function is helpful when having obtained a pin on a large folio * using memfd_pin_folios(), but wanting to logically unpin parts * (e.g., individual pages) of the folio later, for example, using * unpin_user_page_range_dirty_lock(). * * This is not the right interface to initially pin a folio. */ int folio_add_pins(struct folio *folio, unsigned int pins) { VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio)); return try_grab_folio(folio, pins, FOLL_PIN); } EXPORT_SYMBOL_GPL(folio_add_pins);
8 1 12 252 19 18 119 4089 11 1 1 8 46 432 2702 2707 2707 106 2677 2708 2 1797 300 437 794 1453 752 95 437 455 123 632 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ADDRCONF_H #define _ADDRCONF_H #define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ #define TEMP_VALID_LIFETIME (7*86400) /* 1 week */ #define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ #define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) #define ADDR_CHECK_FREQUENCY (120*HZ) #define IPV6_MAX_ADDRESSES 16 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ / 50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) #define ADDRCONF_NOTIFY_PRIORITY 0 #include <linux/in.h> #include <linux/in6.h> struct prefix_info { __u8 type; __u8 length; __u8 prefix_len; union __packed { __u8 flags; struct __packed { #if defined(__BIG_ENDIAN_BITFIELD) __u8 onlink : 1, autoconf : 1, routeraddr : 1, preferpd : 1, reserved : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved : 4, preferpd : 1, routeraddr : 1, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif }; }; __be32 valid; __be32 prefered; __be32 reserved2; struct in6_addr prefix; }; /* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */ static_assert(sizeof(struct prefix_info) == 32); #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> #include <net/ipv6.h> struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; struct netlink_ext_ack *extack; }; struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; u8 ifa_proto; const struct in6_addr *peer_pfx; u32 rt_priority; u32 ifa_flags; u32 preferred_lft; u32 valid_lft; u16 scope; }; enum addr_type_t { UNICAST_ADDR, MULTICAST_ADDR, ANYCAST_ADDR, }; struct inet6_fill_args { u32 portid; u32 seq; int event; unsigned int flags; int netnsid; int ifindex; enum addr_type_t type; bool force_rt_scope_universe; }; int addrconf_init(void); void addrconf_cleanup(void); int addrconf_add_ifaddr(struct net *net, void __user *arg); int addrconf_del_ifaddr(struct net *net, void __user *arg); int addrconf_set_dstaddr(struct net *net, void __user *arg); int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, bool skip_dev_check, int strict, u32 banned_flags); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, unsigned char nsegs); bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict); int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags); int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, const struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft); static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr) { memcpy(eui, addr, 3); eui[3] = 0xFF; eui[4] = 0xFE; memcpy(eui + 5, addr + 3, 3); } static inline void addrconf_addr_eui48(u8 *eui, const char *const addr) { addrconf_addr_eui48_base(eui, addr); eui[0] ^= 2; } static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) return -1; /* * The zSeries OSA network cards can be shared among various * OS instances, but the OSA cards have only one MAC address. * This leads to duplicate address conflicts in conjunction * with IPv6 if more than one instance uses the same card. * * The driver for these cards can deliver a unique 16-bit * identifier for each instance sharing the same card. It is * placed instead of 0xFFFE in the interface identifier. The * "u" bit of the interface identifier is not inverted in this * case. Hence the resulting interface identifier has local * scope according to RFC2373. */ addrconf_addr_eui48_base(eui, dev->dev_addr); if (dev->dev_id) { eui[3] = (dev->dev_id >> 8) & 0xFF; eui[4] = dev->dev_id & 0xFF; } else { eui[0] ^= 2; } return 0; } #define INFINITY_LIFE_TIME 0xFFFFFFFF static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) { if (timeout == INFINITY_LIFE_TIME) return ~0UL; /* * Avoid arithmetic overflow. * Assuming unit is constant and non-zero, this "if" statement * will go away on 64bit archs. */ if (0xfffffffe > LONG_MAX / unit && timeout > LONG_MAX / unit) return LONG_MAX / unit; return timeout; } static inline int addrconf_finite_timeout(unsigned long timeout) { return ~timeout; } /* * IPv6 Address Label subsystem (addrlabel.c) */ int ipv6_addr_label_init(void); void ipv6_addr_label_cleanup(void); int ipv6_addr_label_rtnl_register(void); u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex); /* * multicast prototypes (mcast.c) */ static inline bool ipv6_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) return false; return pskb_may_pull(skb, len); } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_mc_close(struct sock *sk); void ipv6_sock_mc_close(struct sock *sk); bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr); int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); void ipv6_mc_up(struct inet6_dev *idev); void ipv6_mc_down(struct inet6_dev *idev); void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb); void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); void ipv6_mc_dad_complete(struct inet6_dev *idev); /* * identify MLD packets for MLD filter exceptions */ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) { struct icmp6hdr *hdr; if (nexthdr != IPPROTO_ICMPV6 || !pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr))) return false; hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (hdr->icmp6_type) { case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: case ICMPV6_MLD2_REPORT: return true; default: break; } return false; } void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao); /* * anycast prototypes (anycast.c) */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_ac_close(struct sock *sk); void ipv6_sock_ac_close(struct sock *sk); int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); void ipv6_ac_destroy_dev(struct inet6_dev *idev); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr); int ipv6_anycast_init(void); void ipv6_anycast_cleanup(void); /* Device notifier */ int register_inet6addr_notifier(struct notifier_block *nb); int unregister_inet6addr_notifier(struct notifier_block *nb); int inet6addr_notifier_call_chain(unsigned long val, void *v); int register_inet6addr_validator_notifier(struct notifier_block *nb); int unregister_inet6addr_validator_notifier(struct notifier_block *nb); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v); void inet6_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->ip6_ptr); } static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev) { return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr); } /** * __in6_dev_stats_get - get inet6_dev pointer for stats * @dev: network device * @skb: skb for original incoming interface if needed * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev, const struct sk_buff *skb) { if (netif_is_l3_master(dev)) dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb)); return __in6_dev_get(dev); } /** * __in6_dev_get_safely - get inet6_dev pointer from netdevice * @dev: network device * * This is a safer version of __in6_dev_get */ static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) { if (likely(dev)) return rcu_dereference_rtnl(dev->ip6_ptr); else return NULL; } /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * This version can be used in any context, and takes a reference * on the inet6_dev. Callers must use in6_dev_put() later to * release this reference. */ static inline struct inet6_dev *in6_dev_get(const struct net_device *dev) { struct inet6_dev *idev; rcu_read_lock(); idev = rcu_dereference(dev->ip6_ptr); if (idev) refcount_inc(&idev->refcnt); rcu_read_unlock(); return idev; } static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev) { struct inet6_dev *idev = __in6_dev_get(dev); return idev ? idev->nd_parms : NULL; } void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void in6_dev_put(struct inet6_dev *idev) { if (refcount_dec_and_test(&idev->refcnt)) in6_dev_finish_destroy(idev); } static inline void in6_dev_put_clear(struct inet6_dev **pidev) { struct inet6_dev *idev = *pidev; if (idev) { in6_dev_put(idev); *pidev = NULL; } } static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); } static inline void in6_dev_hold(struct inet6_dev *idev) { refcount_inc(&idev->refcnt); } /* called with rcu_read_lock held */ static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev); if (unlikely(!idev)) return true; return !!READ_ONCE(idev->cnf.ignore_routes_with_linkdown); } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) { if (refcount_dec_and_test(&ifp->refcnt)) inet6_ifa_finish_destroy(ifp); } static inline void __in6_ifa_put(struct inet6_ifaddr *ifp) { refcount_dec(&ifp->refcnt); } static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) { refcount_inc(&ifp->refcnt); } static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp) { return refcount_inc_not_zero(&ifp->refcnt); } /* * compute link-local solicited-node multicast address */ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) { ipv6_addr_set(solicited, htonl(0xFF020000), 0, htonl(0x1), htonl(0xFF000000) | addr->s6_addr32[3]); } static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; #endif } static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; #endif } static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) { return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); } static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) & cpu_to_be64(0xffffffffff000000UL))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x00000001)) | (addr->s6_addr[12] ^ 0xff)) == 0; #endif } static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(0x6a))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0; #endif } #ifdef CONFIG_PROC_FS int if6_proc_init(void); void if6_proc_exit(void); #endif int inet6_fill_ifmcaddr(struct sk_buff *skb, const struct ifmcaddr6 *ifmca, struct inet6_fill_args *args); int inet6_fill_ifacaddr(struct sk_buff *skb, const struct ifacaddr6 *ifaca, struct inet6_fill_args *args); #endif
11 11 20 1 2 4 13 2 12 1 11 7 7 4 7 4 5 5 69 70 14 8 47 47 48 48 8 7 36 43 36 7 29 14 70 2 1 1 6 6 2 4 6 6 1 2 1 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include <linux/bpf.h> #include <linux/jhash.h> #include <linux/filter.h> #include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> #include <linux/btf_ids.h> #include <linux/buildid.h> #include "percpu_freelist.h" #include "mmap_unlock_work.h" #define STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ BPF_F_STACK_BUILD_ID) struct stack_map_bucket { struct pcpu_freelist_node fnode; u32 hash; u32 nr; u64 data[]; }; struct bpf_stack_map { struct bpf_map map; void *elems; struct pcpu_freelist freelist; u32 n_buckets; struct stack_map_bucket *buckets[] __counted_by(n_buckets); }; static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); } static inline int stack_map_data_size(struct bpf_map *map) { return stack_map_use_build_id(map) ? sizeof(struct bpf_stack_build_id) : sizeof(u64); } static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { u64 elem_size = sizeof(struct stack_map_bucket) + (u64)smap->map.value_size; int err; smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, smap->map.numa_node); if (!smap->elems) return -ENOMEM; err = pcpu_freelist_init(&smap->freelist); if (err) goto free_elems; pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, smap->map.max_entries); return 0; free_elems: bpf_map_area_free(smap->elems); return err; } /* Called from syscall */ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; u64 cost, n_buckets; int err; if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || value_size < 8 || value_size % 8) return ERR_PTR(-EINVAL); BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); if (attr->map_flags & BPF_F_STACK_BUILD_ID) { if (value_size % sizeof(struct bpf_stack_build_id) || value_size / sizeof(struct bpf_stack_build_id) > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); } else if (value_size / 8 > sysctl_perf_event_max_stack) return ERR_PTR(-EINVAL); /* hash table size must be power of 2; roundup_pow_of_two() can overflow * into UB on 32-bit arches, so check that first */ if (attr->max_entries > 1UL << 31) return ERR_PTR(-E2BIG); n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; return &smap->map; put_buffers: put_callchain_buffers(); free_smap: bpf_map_area_free(smap); return ERR_PTR(err); } static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault) { return may_fault ? build_id_parse(vma, build_id, NULL) : build_id_parse_nofault(vma, build_id, NULL); } /* * Expects all id_offs[i].ip values to be set to correct initial IPs. * They will be subsequently: * - either adjusted in place to a file offset, if build ID fetching * succeeds; in this case id_offs[i].build_id is set to correct build ID, * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID; * - or IP will be kept intact, if build ID fetching failed; in this case * id_offs[i].build_id is zeroed out and id_offs[i].status is set to * BPF_STACK_BUILD_ID_IP. */ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u32 trace_nr, bool user, bool may_fault) { int i; struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); struct vm_area_struct *vma, *prev_vma = NULL; const char *prev_build_id; /* If the irq_work is in use, fall back to report ips. Same * fallback is used for kernel stack (!user) on a stackmap with * build_id. */ if (!user || !current || !current->mm || irq_work_busy || !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { u64 ip = READ_ONCE(id_offs[i].ip); if (range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); goto build_id_valid; } vma = find_vma(current->mm, ip); if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } build_id_valid: id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; prev_vma = vma; prev_build_id = id_offs[i].build_id; } bpf_mmap_unlock_mm(work, current->mm); } static struct perf_callchain_entry * get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; int rctx; entry = get_callchain_entry(&rctx); if (!entry) return NULL; entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is * necessary to fix this mismatch. */ if (__BITS_PER_LONG != 64) { unsigned long *from = (unsigned long *) entry->ip; u64 *to = entry->ip; int i; /* copy data from the end to avoid using extra buffer */ for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } put_callchain_entry(rctx); return entry; #else /* CONFIG_STACKTRACE */ return NULL; #endif } static long __bpf_get_stackid(struct bpf_map *map, struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len, i; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); hash_matches = bucket && bucket->hash == hash; /* fast cmp */ if (hash_matches && flags & BPF_F_FAST_STACK_CMP) return id; if (stack_map_use_build_id(map)) { struct bpf_stack_build_id *id_offs; /* for build_id+offset, pop a bucket before slow cmp */ new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; new_bucket->nr = trace_nr; id_offs = (struct bpf_stack_build_id *)new_bucket->data; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return id; } if (bucket && !(flags & BPF_F_REUSE_STACKID)) { pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); return -EEXIST; } } else { if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, ips, trace_len) == 0) return id; if (bucket && !(flags & BPF_F_REUSE_STACKID)) return -EEXIST; new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; memcpy(new_bucket->data, ips, trace_len); } new_bucket->hash = hash; new_bucket->nr = trace_nr; old_bucket = xchg(&smap->buckets[id], new_bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return id; } BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; max_depth += skip; if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ return -EFAULT; return __bpf_get_stackid(map, trace, flags); } const struct bpf_func_proto bpf_get_stackid_proto = { .func = bpf_get_stackid, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static __u64 count_kernel_ip(struct perf_callchain_entry *trace) { __u64 nr_kernel = 0; while (nr_kernel < trace->nr) { if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) break; nr_kernel++; } return nr_kernel; } BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, struct bpf_map *, map, u64, flags) { struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; __u64 nr_kernel; int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; user = flags & BPF_F_USER_STACK; kernel = !user; trace = ctx->data->callchain; if (unlikely(!trace)) return -EFAULT; nr_kernel = count_kernel_ip(trace); if (kernel) { __u64 nr = trace->nr; trace->nr = nr_kernel; ret = __bpf_get_stackid(map, trace, flags); /* restore nr */ trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; skip += nr_kernel; if (skip > BPF_F_SKIP_FIELD_MASK) return -EFAULT; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; ret = __bpf_get_stackid(map, trace, flags); } return ret; } const struct bpf_func_proto bpf_get_stackid_proto_pe = { .func = bpf_get_stackid_pe, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags, bool may_fault) { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; bool crosstask = task && task != current; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; int err = -EINVAL; u64 *ips; if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) goto clear; if (kernel && user_build_id) goto clear; elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64); if (unlikely(size % elem_size)) goto clear; /* cannot get valid user stack for task without user_mode regs */ if (task && user && !user_mode(regs)) goto err_fault; /* get_perf_callchain does not support crosstask user stack walking * but returns an empty stack instead of NULL. */ if (crosstask && user) { err = -EOPNOTSUPP; goto clear; } num_elem = size / elem_size; max_depth = num_elem + skip; if (sysctl_perf_event_max_stack < max_depth) max_depth = sysctl_perf_event_max_stack; if (may_fault) rcu_read_lock(); /* need RCU for perf's callchain below */ if (trace_in) trace = trace_in; else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { if (may_fault) rcu_read_unlock(); goto err_fault; } trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; if (user_build_id) { struct bpf_stack_build_id *id_offs = buf; u32 i; for (i = 0; i < trace_nr; i++) id_offs[i].ip = ips[i]; } else { memcpy(buf, ips, copy_len); } /* trace/ips should not be dereferenced after this point */ if (may_fault) rcu_read_unlock(); if (user_build_id) stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); if (size > copy_len) memset(buf + copy_len, 0, size - copy_len); return copy_len; err_fault: err = -EFAULT; clear: memset(buf, 0, size); return err; } BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); } const struct bpf_func_proto bpf_get_stack_proto = { .func = bpf_get_stack, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */); } const struct bpf_func_proto bpf_get_stack_sleepable_proto = { .func = bpf_get_stack_sleepable, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags, bool may_fault) { struct pt_regs *regs; long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); if (regs) res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault); put_task_stack(task); return res; } BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */); } const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf, u32, size, u64, flags) { return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */); } const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = { .func = bpf_get_task_stack_sleepable, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { struct pt_regs *regs = (struct pt_regs *)(ctx->regs); struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; int err = -EINVAL; __u64 nr_kernel; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) goto clear; user = flags & BPF_F_USER_STACK; kernel = !user; err = -EFAULT; trace = ctx->data->callchain; if (unlikely(!trace)) goto clear; nr_kernel = count_kernel_ip(trace); if (kernel) { __u64 nr = trace->nr; trace->nr = nr_kernel; err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); /* restore nr */ trace->nr = nr; } else { /* user */ u64 skip = flags & BPF_F_SKIP_FIELD_MASK; skip += nr_kernel; if (skip > BPF_F_SKIP_FIELD_MASK) goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); } return err; clear: memset(buf, 0, size); return err; } const struct bpf_func_proto bpf_get_stack_proto_pe = { .func = bpf_get_stack_pe, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EOPNOTSUPP); } /* Called from syscall */ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; u32 id = *(u32 *)key, trace_len; if (unlikely(id >= smap->n_buckets)) return -ENOENT; bucket = xchg(&smap->buckets[id], NULL); if (!bucket) return -ENOENT; trace_len = bucket->nr * stack_map_data_size(map); memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket) pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); u32 id; WARN_ON_ONCE(!rcu_read_lock_held()); if (!key) { id = 0; } else { id = *(u32 *)key; if (id >= smap->n_buckets || !smap->buckets[id]) id = 0; else id++; } while (id < smap->n_buckets && !smap->buckets[id]) id++; if (id >= smap->n_buckets) return -ENOENT; *(u32 *)next_key = id; return 0; } static long stack_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ static long stack_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *old_bucket; u32 id = *(u32 *)key; if (unlikely(id >= smap->n_buckets)) return -E2BIG; old_bucket = xchg(&smap->buckets[id], NULL); if (old_bucket) { pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; } else { return -ENOENT; } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void stack_map_free(struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); bpf_map_area_free(smap->elems); pcpu_freelist_destroy(&smap->freelist); bpf_map_area_free(smap); put_callchain_buffers(); } static u64 stack_map_mem_usage(const struct bpf_map *map) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); u64 value_size = map->value_size; u64 n_buckets = smap->n_buckets; u64 enties = map->max_entries; u64 usage = sizeof(*smap); usage += n_buckets * sizeof(struct stack_map_bucket *); usage += enties * (sizeof(struct stack_map_bucket) + value_size); return usage; } BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map) const struct bpf_map_ops stack_trace_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = stack_map_mem_usage, .map_btf_id = &stack_trace_map_btf_ids[0], };
12 1 4501 4502 991 1993 7 3 2611 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H #include <linux/capability.h> #include <linux/init.h> #include <linux/key.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/uidgid.h> #include <linux/sched.h> #include <linux/sched/user.h> struct cred; struct inode; /* * COW Supplementary groups list */ struct group_info { refcount_t usage; int ngroups; kgid_t gid[]; } __randomize_layout; /** * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. */ static inline struct group_info *get_group_info(struct group_info *gi) { refcount_inc(&gi->usage); return gi; } /** * put_group_info - Release a reference to a group info structure * @group_info: The group info to release */ #define put_group_info(group_info) \ do { \ if (refcount_dec_and_test(&(group_info)->usage)) \ groups_free(group_info); \ } while (0) #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); extern int groups_search(const struct group_info *, kgid_t); extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern bool may_setgroups(void); extern void groups_sort(struct group_info *); #else static inline void groups_free(struct group_info *group_info) { } static inline int in_group_p(kgid_t grp) { return 1; } static inline int in_egroup_p(kgid_t grp) { return 1; } static inline int groups_search(const struct group_info *group_info, kgid_t grp) { return 1; } #endif /* * The security context of a task * * The parts of the context break down into two categories: * * (1) The objective context of a task. These parts are used when some other * task is attempting to affect this one. * * (2) The subjective context. These details are used when the task is acting * upon another object, be that a file, a task, a key or whatever. * * Note that some members of this structure belong to both categories - the * LSM security pointer for instance. * * A task has two security pointers. task->real_cred points to the objective * context that defines that task's actual details. The objective part of this * context is used whenever that task is acted upon. * * task->cred points to the subjective context that defines the details of how * that task is going to act upon another object. This may be overridden * temporarily to point to another security context, but normally points to the * same context as task->real_cred. */ struct cred { atomic_long_t usage; kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct ucounts *ucounts; struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? */ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); extern int set_cred_ucounts(struct cred *); static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, cap_intersect(cred->cap_permitted, cred->cap_inheritable)); } static inline const struct cred *override_creds(const struct cred *override_cred) { return rcu_replace_pointer(current->cred, override_cred, 1); } static inline const struct cred *revert_creds(const struct cred *revert_cred) { return rcu_replace_pointer(current->cred, revert_cred, 1); } /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference * @nr: Number of references to acquire * * Get references on the specified set of credentials. The caller must release * all acquired reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. Although the * pointer is const, this will temporarily discard the const and increment the * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. */ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; nonconst_cred->non_rcu = 0; atomic_long_add(nr, &nonconst_cred->usage); return cred; } /* * get_cred - Get a reference on a set of credentials * @cred: The credentials to reference * * Get a reference on the specified set of credentials. The caller must * release the reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. */ static inline const struct cred *get_cred(const struct cred *cred) { return get_cred_many(cred, 1); } static inline const struct cred *get_cred_rcu(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; if (!atomic_long_inc_not_zero(&nonconst_cred->usage)) return NULL; nonconst_cred->non_rcu = 0; return cred; } /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * @nr: Number of references to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. * * This takes a const pointer to a set of credentials because the credentials * on task_struct are attached by const pointers to prevent accidental * alteration of otherwise immutable credential sets. */ static inline void put_cred_many(const struct cred *_cred, int nr) { struct cred *cred = (struct cred *) _cred; if (cred) { if (atomic_long_sub_and_test(nr, &cred->usage)) __put_cred(cred); } } /* * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. */ static inline void put_cred(const struct cred *cred) { put_cred_many(cred, 1); } /** * current_cred - Access the current task's subjective credentials * * Access the subjective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_cred() \ rcu_dereference_protected(current->cred, 1) /** * current_real_cred - Access the current task's objective credentials * * Access the objective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_real_cred() \ rcu_dereference_protected(current->real_cred, 1) /** * __task_cred - Access a task's objective credentials * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ #define __task_cred(task) \ rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials * * Get the subjective credentials of the current task, pinning them so that * they can't go away. Accessing the current task's credentials directly is * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) /** * get_current_user - Get the current task's user_struct * * Get the user record of the current task, pinning it so that it can't go * away. */ #define get_current_user() \ ({ \ struct user_struct *__u; \ const struct cred *__cred; \ __cred = current_cred(); \ __u = get_uid(__cred->user); \ __u; \ }) /** * get_current_groups - Get the current task's supplementary group list * * Get the supplementary group list of the current task, pinning it so that it * can't go away. */ #define get_current_groups() \ ({ \ struct group_info *__groups; \ const struct cred *__cred; \ __cred = current_cred(); \ __groups = get_group_info(__cred->group_info); \ __groups; \ }) #define task_cred_xxx(task, xxx) \ ({ \ __typeof__(((struct cred *)NULL)->xxx) ___val; \ rcu_read_lock(); \ ___val = __task_cred((task))->xxx; \ rcu_read_unlock(); \ ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) #define task_ucounts(task) (task_cred_xxx((task), ucounts)) #define current_cred_xxx(xxx) \ ({ \ current_cred()->xxx; \ }) #define current_uid() (current_cred_xxx(uid)) #define current_gid() (current_cred_xxx(gid)) #define current_euid() (current_cred_xxx(euid)) #define current_egid() (current_cred_xxx(egid)) #define current_suid() (current_cred_xxx(suid)) #define current_sgid() (current_cred_xxx(sgid)) #define current_fsuid() (current_cred_xxx(fsuid)) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) #define current_ucounts() (current_cred_xxx(ucounts)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) #else static inline struct user_namespace *current_user_ns(void) { return &init_user_ns; } #endif #define current_uid_gid(_uid, _gid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_uid) = __cred->uid; \ *(_gid) = __cred->gid; \ } while(0) #define current_euid_egid(_euid, _egid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_euid) = __cred->euid; \ *(_egid) = __cred->egid; \ } while(0) #define current_fsuid_fsgid(_fsuid, _fsgid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_fsuid) = __cred->fsuid; \ *(_fsgid) = __cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */
13 1 11 11 1 1 11 11 11 1 2 2 10 10 10 1 6 5 10 10 1 1 1 2 2 2 2 1 1 1 1 1 1 2 62 61 60 5 29 1 3 32 27 86 87 2 2 23 10 15 4 12 2 12 2 13 2 2 2 2 18 18 15 6 2 5 16 4 4 23 20 3 3 3 3 20 18 4 4 4 23 17 76 76 54 23 23 23 18 17 23 20 23 18 18 1 1 7 16 4 1 7 2 4 5 7 10 10 10 6 8 3 10 6 7 10 13 4 13 1 1 69 69 44 25 10 1 1 1 1 1 1 1 1 10 9 2 1 1 97 91 32 1 3 4 4 97 10 7 5 12 10 7 86 12 2 87 12 2 2 2 3 2 1 17 95 4 97 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <crypto/sha2.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } static void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct mptcp_options_received *mp_opt) { u8 subtype = *ptr >> 4; int expected_opsize; u16 subopt; u8 version; u8 flags; u8 i; switch (subtype) { case MPTCPOPT_MP_CAPABLE: /* strict size checking */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { if (skb->len > tcp_hdr(skb)->doff << 2) expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; else expected_opsize = TCPOLEN_MPTCP_MPC_ACK; subopt = OPTION_MPTCP_MPC_ACK; } else { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) { expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; subopt = OPTION_MPTCP_MPC_SYNACK; } else { expected_opsize = TCPOLEN_MPTCP_MPC_SYN; subopt = OPTION_MPTCP_MPC_SYN; } } /* Cfr RFC 8684 Section 3.3.0: * If a checksum is present but its use had * not been negotiated in the MP_CAPABLE handshake, the receiver MUST * close the subflow with a RST, as it is not behaving as negotiated. * If a checksum is not present when its use has been negotiated, the * receiver MUST close the subflow with a RST, as it is considered * broken * We parse even option with mismatching csum presence, so that * later in subflow_data_ready we can trigger the reset. */ if (opsize != expected_opsize && (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ version = *ptr++ & MPTCP_VERSION_MASK; if (opsize != TCPOLEN_MPTCP_MPC_SYN) { if (version != MPTCP_SUPPORTED_VERSION) break; } else if (version < MPTCP_SUPPORTED_VERSION) { break; } flags = *ptr++; if (!mptcp_cap_flag_sha256(flags) || (flags & MPTCP_CAP_EXTENSIBILITY)) break; /* RFC 6824, Section 3.1: * "For the Checksum Required bit (labeled "A"), if either * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." */ if (flags & MPTCP_CAP_CHECKSUM_REQD) mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); mp_opt->suboptions |= subopt; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used * interchangeably." */ mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u\n", version, flags, opsize, mp_opt->sndr_key, mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYN; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->token = get_unaligned_be32(ptr); ptr += 4; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_SYNACK; mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->thmac = get_unaligned_be64(ptr); ptr += 8; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u\n", mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { mp_opt->suboptions |= OPTION_MPTCP_MPJ_ACK; ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac\n"); } break; case MPTCPOPT_DSS: pr_debug("DSS\n"); ptr++; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d\n", mp_opt->data_fin, mp_opt->dsn64, mp_opt->use_map, mp_opt->ack64, mp_opt->use_ack); expected_opsize = TCPOLEN_MPTCP_DSS_BASE; if (mp_opt->use_ack) { if (mp_opt->ack64) expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; else expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; } if (mp_opt->use_map) { if (mp_opt->dsn64) expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; else expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } /* Always parse any csum presence combination, we will enforce * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) break; mp_opt->suboptions |= OPTION_MPTCP_DSS; if (mp_opt->use_ack) { if (mp_opt->ack64) { mp_opt->data_ack = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_ack = get_unaligned_be32(ptr); ptr += 4; } pr_debug("data_ack=%llu\n", mp_opt->data_ack); } if (mp_opt->use_map) { if (mp_opt->dsn64) { mp_opt->data_seq = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_seq = get_unaligned_be32(ptr); ptr += 4; } mp_opt->subflow_seq = get_unaligned_be32(ptr); ptr += 4; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u\n", mp_opt->data_seq, mp_opt->subflow_seq, mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), mp_opt->csum); } break; case MPTCPOPT_ADD_ADDR: mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; if (!mp_opt->echo) { if (opsize == TCPOLEN_MPTCP_ADD_ADDR || opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } else { if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR; mp_opt->addr.id = *ptr++; mp_opt->addr.port = 0; mp_opt->ahmac = 0; if (mp_opt->addr.family == AF_INET) { memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else { memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16); ptr += 16; if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #endif if (!mp_opt->echo) { mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d\n", (mp_opt->addr.family == AF_INET6) ? "6" : "", mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; case MPTCPOPT_RM_ADDR: if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 || opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX) break; ptr++; mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR; mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; pr_debug("RM_ADDR: rm_list_nr=%d\n", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: if (opsize != TCPOLEN_MPTCP_PRIO) break; mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; pr_debug("MP_PRIO: prio=%d\n", mp_opt->backup); break; case MPTCPOPT_MP_FASTCLOSE: if (opsize != TCPOLEN_MPTCP_FASTCLOSE) break; ptr += 2; mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; pr_debug("MP_FASTCLOSE: recv_key=%llu\n", mp_opt->rcvr_key); break; case MPTCPOPT_RST: if (opsize != TCPOLEN_MPTCP_RST) break; if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) break; mp_opt->suboptions |= OPTION_MPTCP_RST; flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; pr_debug("MP_RST: transient=%u reason=%u\n", mp_opt->reset_transient, mp_opt->reset_reason); break; case MPTCPOPT_MP_FAIL: if (opsize != TCPOLEN_MPTCP_FAIL) break; ptr += 2; mp_opt->suboptions |= OPTION_MPTCP_FAIL; mp_opt->fail_seq = get_unaligned_be64(ptr); pr_debug("MP_FAIL: data_seq=%llu\n", mp_opt->fail_seq); break; default: break; } } void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; /* Ensure that casting the whole status to u32 is efficient and safe */ BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32)); BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status), sizeof(u32))); *(u32 *)&mp_opt->status = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_MPTCP) mptcp_parse_option(skb, ptr, opsize, mp_opt); ptr += opsize - 2; length -= opsize; } } } bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* we will use snd_isn to detect first pkt [re]transmission * in mptcp_established_options_mp() */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { pr_debug("remote_token=%u, nonce=%u\n", subflow->remote_token, subflow->local_nonce); opts->suboptions = OPTION_MPTCP_MPJ_SYN; opts->join_id = subflow->local_id; opts->token = subflow->remote_token; opts->nonce = subflow->local_nonce; opts->backup = subflow->request_bkup; *size = TCPOLEN_MPTCP_MPJ_SYN; return true; } return false; } static void clear_3rdack_retransmission(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_delack_timer); icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so * tell the caller to defer the estimate to * mptcp_established_options_dss(), which will reserve enough space. */ if (!skb) return false; /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ if (READ_ONCE(subflow->fully_established) || snd_data_fin_enable || subflow->snd_isn != TCP_SKB_CB(skb)->seq || sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; opts->csum_reqd = READ_ONCE(msk->csum_enabled); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ if (data_len > 0) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ opts->data_seq = mpext->data_seq; opts->subflow_seq = mpext->subflow_seq; opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); } else { *size = TCPOLEN_MPTCP_MPC_ACK; } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d\n", subflow, subflow->local_key, subflow->remote_key, data_len); return true; } else if (subflow->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_ACK; memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); *size = TCPOLEN_MPTCP_MPJ_ACK; pr_debug("subflow=%p\n", subflow); /* we can use the full delegate action helper only from BH context * If we are in process context - sk is flushing the backlog at * socket lock release time - just set the appropriate flag, will * be handled by the release callback */ if (sock_owned_by_user(sk)) set_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status); else mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_ACK); return true; } return false; } static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { /* The write_seq value has already been incremented, so the actual * sequence number for the DATA_FIN is one less. */ u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1; if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped */ ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; ext->data_seq = data_fin_tx_seq; ext->subflow_seq = 0; ext->data_len = 1; } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. */ ext->data_fin = 1; ext->data_len++; } } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; u64 ack_seq; opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; if (mpext) { if (opts->csum_reqd) map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; opts->ext_copy = *mpext; } dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); opts->suboptions = OPTION_MPTCP_DSS; ret = true; } /* passive sockets msk will set the 'can_ack' after accept(), even * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; } ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; opts->suboptions = OPTION_MPTCP_DSS; /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) ack_size += TCPOLEN_MPTCP_DSS_BASE; dss_size += ack_size; *size = ALIGN(dss_size, 4); return true; } static u64 add_addr_generate_hmac(u64 key1, u64 key2, struct mptcp_addr_info *addr) { u16 port = ntohs(addr->port); u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; int i = 0; msg[i++] = addr->id; if (addr->family == AF_INET) { memcpy(&msg[i], &addr->addr.s_addr, 4); i += 4; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) { memcpy(&msg[i], &addr->addr6.s6_addr, 16); i += 16; } #endif msg[i++] = port >> 8; msg[i++] = port & 0xFF; mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac); return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; struct mptcp_addr_info addr; bool echo; int len; /* add addr will strip the existing options, be sure to avoid breaking * MPC/MPJ handshakes */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; /* * Later on, mptcp_write_options() will enforce mutually exclusion with * DSS, bail out if such option is set and we can't drop it. */ if (drop_other_suboptions) remaining += opt_size; else if (opts->suboptions & OPTION_MPTCP_DSS) return false; len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) { pr_debug("drop other suboptions\n"); opts->suboptions = 0; /* note that e.g. DSS could have written into the memory * aliased by ahmac, we must reset the field here * to avoid appending the hmac even for ADD_ADDR echo * options */ opts->ahmac = 0; *size -= opt_size; } opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key), READ_ONCE(msk->remote_key), &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); } pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d\n", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } static bool mptcp_established_options_rm_addr(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_rm_list rm_list; int i, len; if (!mptcp_pm_should_rm_signal(msk) || !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; len = mptcp_rm_addr_len(&rm_list); if (len < 0) return false; if (remaining < len) return false; *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; opts->rm_list = rm_list; for (i = 0; i < opts->rm_list.nr; i++) pr_debug("rm_list_ids[%d]=%d\n", i, opts->rm_list.ids[i]); MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr); return true; } static bool mptcp_established_options_mp_prio(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) return false; *size = TCPOLEN_MPTCP_PRIO_ALIGN; opts->suboptions |= OPTION_MPTCP_PRIO; opts->backup = subflow->request_bkup; pr_debug("prio=%d\n", opts->backup); return true; } static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (remaining < TCPOLEN_MPTCP_RST) return false; *size = TCPOLEN_MPTCP_RST; opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } static bool mptcp_established_options_fastclose(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (likely(!subflow->send_fastclose)) return false; if (remaining < TCPOLEN_MPTCP_FASTCLOSE) return false; *size = TCPOLEN_MPTCP_FASTCLOSE; opts->suboptions |= OPTION_MPTCP_FASTCLOSE; opts->rcvr_key = READ_ONCE(msk->remote_key); pr_debug("FASTCLOSE key=%llu\n", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } static bool mptcp_established_options_mp_fail(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (likely(!subflow->send_mp_fail)) return false; if (remaining < TCPOLEN_MPTCP_FAIL) return false; *size = TCPOLEN_MPTCP_FAIL; opts->suboptions |= OPTION_MPTCP_FAIL; opts->fail_seq = subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu\n", opts->fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; bool snd_data_fin; bool ret = false; opts->suboptions = 0; if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } return true; } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) { unsigned int mp_fail_size; ret = true; if (mptcp_established_options_mp_fail(sk, &mp_fail_size, remaining - opt_size, opts)) { *size += opt_size + mp_fail_size; remaining -= opt_size - mp_fail_size; return true; } } /* we reserved enough space for the above options, and exceeding the * TCP option space would be fatal */ if (WARN_ON_ONCE(opt_size > remaining)) return false; *size += opt_size; remaining -= opt_size; if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } return ret; } bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; opts->csum_reqd = subflow_req->csum_reqd; opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu\n", subflow_req, subflow_req->local_key); return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; opts->backup = subflow_req->request_bkup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u\n", subflow_req, opts->backup, opts->join_id, opts->thmac, opts->nonce); *size = TCPOLEN_MPTCP_MPJ_SYNACK; return true; } return false; } static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack * will make the subflow fully established */ if (likely(READ_ONCE(subflow->fully_established))) { /* on passive sockets, check for 3rd ack retransmission * note that msk is always set by subflow_syn_recv_sock() * for mp_join subflows */ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && !subflow->request_join) tcp_send_ack(ssk); goto check_notify; } /* we must process OoO packets before the first subflow is fully * established. OoO packets are instead a protocol violation * for MP_JOIN subflows as the peer must not send any data * before receiving the forth ack - cfr. RFC 8684 section 3.2. */ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { if (subflow->mp_join) goto reset; if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) goto set_fully_established; return subflow->mp_capable; } if (subflow->remote_key_valid && (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && (!mp_opt->echo || subflow->mp_join)))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ goto set_fully_established; } /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. */ if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) { if (subflow->mp_join) goto reset; subflow->mp_capable = 0; pr_fallback(msk); mptcp_do_fallback(ssk); return false; } if (mp_opt->deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); check_notify: /* if the subflow is not already linked into the conn_list, we can't * notify the PM: this subflow is still on the listener queue * and the PM possibly acquiring the subflow lock could race with * the listener close */ if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; if (subflow->mp_join) { clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk); } else { mptcp_pm_fully_established(msk, ssk); } return true; reset: mptcp_subflow_reset(ssk); return false; } u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { u32 old_seq32, cur_seq32; old_seq32 = (u32)old_seq; cur_seq32 = (u32)cur_seq; cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) return cur_seq + (1LL << 32); /* reverse wrap could happen, too */ if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) return cur_seq - (1LL << 32); return cur_seq; } static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) { msk->bytes_acked += new_snd_una - msk->snd_una; WRITE_ONCE(msk->snd_una, new_snd_una); } static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) { u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); struct sock *sk = (struct sock *)msk; u64 old_snd_una; mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore.*/ if (unlikely(after64(new_snd_una, snd_nxt))) new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; if (after64(new_wnd_end, msk->wnd_end)) WRITE_ONCE(msk->wnd_end, new_wnd_end); /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ if (after64(msk->wnd_end, snd_nxt)) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } msk->last_ack_recv = tcp_jiffies32; mptcp_data_unlock(sk); trace_ack_update_msk(mp_opt->data_ack, old_snd_una, new_snd_una, new_wnd_end, READ_ONCE(msk->wnd_end)); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) { /* Skip if DATA_FIN was already received. * If updating simultaneously with the recvmsg loop, values * should match. If they mismatch, the peer is misbehaving and * we will prefer the most recent information. */ if (READ_ONCE(msk->rcv_data_fin)) return false; WRITE_ONCE(msk->rcv_data_fin_seq, mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; } static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { u64 hmac = 0; if (mp_opt->echo) return true; hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", msk, hmac, mp_opt->ahmac); return hmac == mp_opt->ahmac; } /* Return false if a subflow has been reset, else return true */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; if (__mptcp_check_fallback(msk)) { /* Keep it simple and unconditionally trigger send data cleanup and * pending queue spooling. We will need to acquire the data lock * for more accurate checks, and once the lock is acquired, such * helpers are cheap. */ mptcp_data_lock(subflow->conn); if (sk_stream_memory_free(sk)) __mptcp_check_push(subflow->conn, sk); /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return true; } mptcp_get_options(skb, &mp_opt); /* The subflow can be in close state only if check_fully_established() * just sent a reset. If so, tell the caller to ignore the current packet. */ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return sk->sk_state != TCP_CLOSE; if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && READ_ONCE(msk->local_key) == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); } if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && add_addr_hmac_valid(msk, &mp_opt)) { if (!mp_opt.echo) { mptcp_pm_add_addr_received(sk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } if (mp_opt.addr.port) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); } if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR) mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); if (mp_opt.suboptions & OPTION_MPTCP_PRIO) { mptcp_pm_mp_prio_received(sk, mp_opt.backup); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); } if (mp_opt.suboptions & OPTION_MPTCP_FAIL) { mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); } if (mp_opt.suboptions & OPTION_MPTCP_RST) { subflow->reset_seen = 1; subflow->reset_reason = mp_opt.reset_reason; subflow->reset_transient = mp_opt.reset_transient; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX); } if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) return true; } /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not * need to be allocated or populated. DATA_FIN information, if * present, needs to be updated here before the skb is freed. */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) mptcp_schedule_work((struct sock *)msk); return true; } mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return true; memset(mpext, 0, sizeof(*mpext)); if (likely(mp_opt.use_map)) { if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data */ mptcp_crypto_key_sha(subflow->remote_key, NULL, &mpext->data_seq); mpext->data_seq++; mpext->subflow_seq = 1; mpext->dsn64 = 1; mpext->mpc_map = 1; mpext->data_fin = 0; } else { mpext->data_seq = mp_opt.data_seq; mpext->subflow_seq = mp_opt.subflow_seq; mpext->dsn64 = mp_opt.dsn64; mpext->data_fin = mp_opt.data_fin; } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD); if (mpext->csum_reqd) mpext->csum = mp_opt.csum; } return true; } static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; u32 new_win; u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); ack_seq = READ_ONCE(msk->ack_seq); rcv_wnd_new = ack_seq + tp->rcv_wnd; rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); if (after64(rcv_wnd_new, rcv_wnd_old)) { u64 rcv_wnd; for (;;) { rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); if (rcv_wnd == rcv_wnd_old) break; rcv_wnd_old = rcv_wnd; if (before64(rcv_wnd_new, rcv_wnd_old)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); goto raise_win; } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); } goto update_wspace; } if (rcv_wnd_new != rcv_wnd_old) { raise_win: win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; /* Make sure we do not exceed the maximum possible * scaled window. */ if (unlikely(th->syn)) new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; if (!tp->rx_opt.rcv_wscale && READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; th->window = htons(new_win); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } update_wspace: WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; /* cfr RFC 8684 3.3.1.: * the data sequence number used in the pseudo-header is * always the 64-bit value, irrespective of what length is used in the * DSS option itself. */ header.data_seq = cpu_to_be64(data_seq); header.subflow_seq = htonl(subflow_seq); header.data_len = htons(data_len); header.csum = 0; csum = csum_partial(&header, sizeof(header), sum); return csum_fold(csum); } static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, ~csum_unfold(mpext->csum)); } static void put_len_csum(u16 len, __sum16 csum, void *data) { __sum16 *sumptr = data + 2; __be16 *ptr = data; put_unaligned_be16(len, ptr); put_unaligned(csum, sumptr); } void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; /* Which options can be used together? * * X: mutually exclusive * O: often used together * C: can be used together in some cases * P: could be used together but we prefer not to (optimisations) * * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC | * ------|------|------|------|------|------|------|------|------| * MPC |------|------|------|------|------|------|------|------| * MPJ | X |------|------|------|------|------|------|------| * DSS | X | X |------|------|------|------|------|------| * ADD | X | X | P |------|------|------|------|------| * RM | C | C | C | P |------|------|------|------| * PRIO | X | C | C | C | C |------|------|------| * FAIL | X | X | C | X | X | X |------|------| * FC | X | X | X | X | X | X | X |------| * RST | X | X | X | X | X | X | O | O | * ------|------|------|------|------|------|------|------|------| * * The same applies in mptcp_established_options() function. */ if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; u8 flags = 0; if (mpext->use_ack) { flags = MPTCP_DSS_HAS_ACK; if (mpext->ack64) { len += TCPOLEN_MPTCP_DSS_ACK64; flags |= MPTCP_DSS_ACK64; } else { len += TCPOLEN_MPTCP_DSS_ACK32; } } if (mpext->use_map) { len += TCPOLEN_MPTCP_DSS_MAP64; /* Use only 64-bit mapping flags for now, add * support for optional 32-bit mappings later. */ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { if (mpext->ack64) { put_unaligned_be64(mpext->data_ack, ptr); ptr += 2; } else { put_unaligned_be32(mpext->data_ack32, ptr); ptr += 1; } } if (mpext->use_map) { put_unaligned_be64(mpext->data_seq, ptr); ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { /* data_len == 0 is reserved for the infinite mapping, * the checksum will also be set to 0. */ put_len_csum(mpext->data_len, (mpext->data_len ? mptcp_make_csum(mpext) : 0), ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; } /* We might need to add MP_FAIL options in rare cases */ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) goto mp_fail; } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } else { len = TCPOLEN_MPTCP_MPC_ACK; } if (opts->csum_reqd) flag |= MPTCP_CAP_CHECKSUM_REQD; if (!opts->allow_join_id0) flag |= MPTCP_CAP_DENY_JOIN_ID0; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->sndr_key, ptr); ptr += 2; if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { put_len_csum(opts->data_len, __mptcp_make_csum(opts->data_seq, opts->subflow_seq, opts->data_len, ~csum_unfold(opts->csum)), ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; /* MPC is additionally mutually exclusive with MP_PRIO */ goto mp_capable_done; } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) { if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, opts->backup, opts->join_id); put_unaligned_be32(opts->token, ptr); ptr += 1; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYNACK, opts->backup, opts->join_id); put_unaligned_be64(opts->thmac, ptr); ptr += 2; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_ACK, 0, 0); memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); ptr += 5; } } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (opts->addr.family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { len += sizeof(opts->ahmac); echo = 0; } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, len, echo, opts->addr.id); if (opts->addr.family == AF_INET) { memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opts->addr.family == AF_INET6) { memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { u16 port = ntohs(opts->addr.port); if (opts->ahmac) { u8 *bptr = (u8 *)ptr; put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; put_unaligned_be16(TCPOPT_NOP << 8 | TCPOPT_NOP, bptr); ptr += 3; } else { put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; } } } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) { /* FASTCLOSE is mutually exclusive with others except RST */ *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE, TCPOLEN_MPTCP_FASTCLOSE, 0, 0); put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { mp_fail: /* MP_FAIL is mutually exclusive with others except RST */ subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_fail = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, TCPOLEN_MPTCP_FAIL, 0, 0); put_unaligned_be64(opts->fail_seq, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { mp_rst: *ptr++ = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, opts->reset_transient, opts->reset_reason); return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_prio = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { u8 i = 1; *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, 0, opts->rm_list.ids[0]); while (i < opts->rm_list.nr) { u8 id1, id2, id3, id4; id1 = opts->rm_list.ids[i]; id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); ptr += 1; i += 4; } } if (tp) mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) { const struct mptcp_ext *ext = mptcp_get_ext(skb); u8 flags, reason; if (ext) { flags = ext->reset_transient; reason = ext->reset_reason; return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, flags, reason); } return htonl(0u); } EXPORT_SYMBOL_GPL(mptcp_get_reset_option);
10 10 10 10 2 10 2 8 11 11 1 10 23 23 2 2 15 15 15 15 26 26 25 3 10 1 1 1 7 7 3 3 1 4 26 26 1 1 6 18 24 15 12 7 10 7 6 9 9 9 9 2 1 14 3 1 1 4 4 1 3 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 // SPDX-License-Identifier: GPL-2.0-only /* * Berkeley Packet Filter based traffic classifier * * Might be used to classify traffic through flexible, user-defined and * possibly JIT-ed BPF filters for traffic control as an alternative to * ematches. * * (C) 2013 Daniel Borkmann <dborkman@redhat.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/skbuff.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/idr.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/tc_wrapper.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 #define CLS_BPF_SUPPORTED_GEN_FLAGS \ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW) struct cls_bpf_head { struct list_head plist; struct idr handle_idr; struct rcu_head rcu; }; struct cls_bpf_prog { struct bpf_prog *filter; struct list_head link; struct tcf_result res; bool exts_integrated; u32 gen_flags; unsigned int in_hw_count; struct tcf_exts exts; u32 handle; u16 bpf_num_ops; struct sock_filter *bpf_ops; const char *bpf_name; struct tcf_proto *tp; struct rcu_work rwork; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; static int cls_bpf_exec_opcode(int code) { switch (code) { case TC_ACT_OK: case TC_ACT_SHOT: case TC_ACT_STOLEN: case TC_ACT_TRAP: case TC_ACT_REDIRECT: case TC_ACT_UNSPEC: return code; default: return TC_ACT_UNSPEC; } } TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); bool at_ingress = skb_at_tc_ingress(skb); struct cls_bpf_prog *prog; int ret = -1; list_for_each_entry_rcu(prog, &head->plist, link) { int filter_res; qdisc_skb_cb(skb)->tc_classid = prog->res.classid; if (tc_skip_sw(prog->gen_flags)) { filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0; } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; if (prog->exts_integrated) { res->class = 0; res->classid = TC_H_MAJ(prog->res.classid) | qdisc_skb_cb(skb)->tc_classid; ret = cls_bpf_exec_opcode(filter_res); if (ret == TC_ACT_UNSPEC) continue; break; } if (filter_res == 0) continue; if (filter_res != -1) { res->class = 0; res->classid = filter_res; } else { *res = prog->res; } ret = tcf_exts_exec(skb, &prog->exts, res); if (ret < 0) continue; break; } return ret; } static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) { return !prog->bpf_ops; } static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_bpf_offload cls_bpf = {}; struct cls_bpf_prog *obj; bool skip_sw; int err; skip_sw = prog && tc_skip_sw(prog->gen_flags); obj = prog ?: oldprog; tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags, extack); cls_bpf.command = TC_CLSBPF_OFFLOAD; cls_bpf.exts = &obj->exts; cls_bpf.prog = prog ? prog->filter : NULL; cls_bpf.oldprog = oldprog ? oldprog->filter : NULL; cls_bpf.name = obj->bpf_name; cls_bpf.exts_integrated = obj->exts_integrated; if (oldprog && prog) err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf, skip_sw, &oldprog->gen_flags, &oldprog->in_hw_count, &prog->gen_flags, &prog->in_hw_count, true); else if (prog) err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf, skip_sw, &prog->gen_flags, &prog->in_hw_count, true); else err = tc_setup_cb_destroy(block, tp, TC_SETUP_CLSBPF, &cls_bpf, skip_sw, &oldprog->gen_flags, &oldprog->in_hw_count, true); if (prog && err) { cls_bpf_offload_cmd(tp, oldprog, prog, extack); return err; } if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; } static u32 cls_bpf_flags(u32 flags) { return flags & CLS_BPF_SUPPORTED_GEN_FLAGS; } static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog, struct netlink_ext_ack *extack) { if (prog && oldprog && cls_bpf_flags(prog->gen_flags) != cls_bpf_flags(oldprog->gen_flags)) return -EINVAL; if (prog && tc_skip_hw(prog->gen_flags)) prog = NULL; if (oldprog && tc_skip_hw(oldprog->gen_flags)) oldprog = NULL; if (!prog && !oldprog) return 0; return cls_bpf_offload_cmd(tp, prog, oldprog, extack); } static void cls_bpf_stop_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct netlink_ext_ack *extack) { int err; err = cls_bpf_offload_cmd(tp, NULL, prog, extack); if (err) pr_err("Stopping hardware offload failed: %d\n", err); } static void cls_bpf_offload_update_stats(struct tcf_proto *tp, struct cls_bpf_prog *prog) { struct tcf_block *block = tp->chain->block; struct tc_cls_bpf_offload cls_bpf = {}; tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, NULL); cls_bpf.command = TC_CLSBPF_STATS; cls_bpf.exts = &prog->exts; cls_bpf.prog = prog->filter; cls_bpf.name = prog->bpf_name; cls_bpf.exts_integrated = prog->exts_integrated; tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false, true); } static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; head = kzalloc(sizeof(*head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD_RCU(&head->plist); idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; } static void cls_bpf_free_parms(struct cls_bpf_prog *prog) { if (cls_bpf_is_ebpf(prog)) bpf_prog_put(prog->filter); else bpf_prog_destroy(prog->filter); kfree(prog->bpf_name); kfree(prog->bpf_ops); } static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); tcf_exts_put_net(&prog->exts); cls_bpf_free_parms(prog); kfree(prog); } static void cls_bpf_delete_prog_work(struct work_struct *work) { struct cls_bpf_prog *prog = container_of(to_rcu_work(work), struct cls_bpf_prog, rwork); rtnl_lock(); __cls_bpf_delete_prog(prog); rtnl_unlock(); } static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); idr_remove(&head->handle_idr, prog->handle); cls_bpf_stop_offload(tp, prog, extack); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); if (tcf_exts_get_net(&prog->exts)) tcf_queue_work(&prog->rwork, cls_bpf_delete_prog_work); else __cls_bpf_delete_prog(prog); } static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); __cls_bpf_delete(tp, arg, extack); *last = list_empty(&head->plist); return 0; } static void cls_bpf_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *tmp; list_for_each_entry_safe(prog, tmp, &head->plist, link) __cls_bpf_delete(tp, prog, extack); idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } static void *cls_bpf_get(struct tcf_proto *tp, u32 handle) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; list_for_each_entry(prog, &head->plist, link) { if (prog->handle == handle) return prog; } return NULL; } static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) { struct sock_filter *bpf_ops; struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; int ret; bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]); if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); if (bpf_size != nla_len(tb[TCA_BPF_OPS])) return -EINVAL; bpf_ops = kmemdup(nla_data(tb[TCA_BPF_OPS]), bpf_size, GFP_KERNEL); if (bpf_ops == NULL) return -ENOMEM; fprog_tmp.len = bpf_num_ops; fprog_tmp.filter = bpf_ops; ret = bpf_prog_create(&fp, &fprog_tmp); if (ret < 0) { kfree(bpf_ops); return ret; } prog->bpf_ops = bpf_ops; prog->bpf_num_ops = bpf_num_ops; prog->bpf_name = NULL; prog->filter = fp; return 0; } static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, u32 gen_flags, const struct tcf_proto *tp) { struct bpf_prog *fp; char *name = NULL; bool skip_sw; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW; fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw); if (IS_ERR(fp)) return PTR_ERR(fp); if (tb[TCA_BPF_NAME]) { name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL); if (!name) { bpf_prog_put(fp); return -ENOMEM; } } prog->bpf_ops = NULL; prog->bpf_name = name; prog->filter = fp; if (fp->dst_needed) tcf_block_netif_keep_dst(tp->chain->block); return 0; } static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); bool is_bpf, is_ebpf, have_exts = false; struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; bool bound_to_filter = false; struct cls_bpf_prog *prog; u32 gen_flags = 0; int ret; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; ret = nla_parse_nested_deprecated(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy, NULL); if (ret < 0) return ret; prog = kzalloc(sizeof(*prog), GFP_KERNEL); if (!prog) return -ENOBUFS; ret = tcf_exts_init(&prog->exts, net, TCA_BPF_ACT, TCA_BPF_POLICE); if (ret < 0) goto errout; if (oldprog) { if (handle && oldprog->handle != handle) { ret = -EINVAL; goto errout; } } if (handle == 0) { handle = 1; ret = idr_alloc_u32(&head->handle_idr, prog, &handle, INT_MAX, GFP_KERNEL); } else if (!oldprog) { ret = idr_alloc_u32(&head->handle_idr, prog, &handle, handle, GFP_KERNEL); } if (ret) goto errout; prog->handle = handle; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; is_ebpf = tb[TCA_BPF_FD]; if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) { ret = -EINVAL; goto errout_idr; } ret = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &prog->exts, flags, extack); if (ret < 0) goto errout_idr; if (tb[TCA_BPF_FLAGS]) { u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { ret = -EINVAL; goto errout_idr; } have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; } if (tb[TCA_BPF_FLAGS_GEN]) { gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || !tc_flags_valid(gen_flags)) { ret = -EINVAL; goto errout_idr; } } prog->exts_integrated = have_exts; prog->gen_flags = gen_flags; ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, gen_flags, tp); if (ret < 0) goto errout_idr; if (tb[TCA_BPF_CLASSID]) { prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); tcf_bind_filter(tp, &prog->res, base); bound_to_filter = true; } ret = cls_bpf_offload(tp, prog, oldprog, extack); if (ret) goto errout_parms; if (!tc_in_hw(prog->gen_flags)) prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; tcf_proto_update_usesw(tp, prog->gen_flags); if (oldprog) { idr_replace(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); tcf_exts_get_net(&oldprog->exts); tcf_queue_work(&oldprog->rwork, cls_bpf_delete_prog_work); } else { list_add_rcu(&prog->link, &head->plist); } *arg = prog; return 0; errout_parms: if (bound_to_filter) tcf_unbind_filter(tp, &prog->res); cls_bpf_free_parms(prog); errout_idr: if (!oldprog) idr_remove(&head->handle_idr, prog->handle); errout: tcf_exts_destroy(&prog->exts); kfree(prog); return ret; } static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { struct nlattr *nla; if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) return -EMSGSIZE; nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * sizeof(struct sock_filter)); if (nla == NULL) return -EMSGSIZE; memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); return 0; } static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { struct nlattr *nla; if (prog->bpf_name && nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; if (nla_put_u32(skb, TCA_BPF_ID, prog->filter->aux->id)) return -EMSGSIZE; nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag)); if (nla == NULL) return -EMSGSIZE; memcpy(nla_data(nla), prog->filter->tag, nla_len(nla)); return 0; } static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *tm, bool rtnl_held) { struct cls_bpf_prog *prog = fh; struct nlattr *nest; u32 bpf_flags = 0; int ret; if (prog == NULL) return skb->len; tm->tcm_handle = prog->handle; cls_bpf_offload_update_stats(tp, prog); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (prog->res.classid && nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; if (cls_bpf_is_ebpf(prog)) ret = cls_bpf_dump_ebpf_info(prog, skb); else ret = cls_bpf_dump_bpf_info(prog, skb); if (ret) goto nla_put_failure; if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; if (prog->exts_integrated) bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) goto nla_put_failure; if (prog->gen_flags && nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags)) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &prog->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct cls_bpf_prog *prog = fh; tc_cls_bind_class(classid, cl, q, &prog->res, base); } static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; list_for_each_entry(prog, &head->plist, link) { if (!tc_cls_stats_dump(tp, arg, prog)) break; } } static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct tcf_block *block = tp->chain->block; struct tc_cls_bpf_offload cls_bpf = {}; struct cls_bpf_prog *prog; int err; list_for_each_entry(prog, &head->plist, link) { if (tc_skip_hw(prog->gen_flags)) continue; tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, extack); cls_bpf.command = TC_CLSBPF_OFFLOAD; cls_bpf.exts = &prog->exts; cls_bpf.prog = add ? prog->filter : NULL; cls_bpf.oldprog = add ? NULL : prog->filter; cls_bpf.name = prog->bpf_name; cls_bpf.exts_integrated = prog->exts_integrated; err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSBPF, &cls_bpf, cb_priv, &prog->gen_flags, &prog->in_hw_count); if (err) return err; } return 0; } static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .kind = "bpf", .owner = THIS_MODULE, .classify = cls_bpf_classify, .init = cls_bpf_init, .destroy = cls_bpf_destroy, .get = cls_bpf_get, .change = cls_bpf_change, .delete = cls_bpf_delete, .walk = cls_bpf_walk, .reoffload = cls_bpf_reoffload, .dump = cls_bpf_dump, .bind_class = cls_bpf_bind_class, }; MODULE_ALIAS_NET_CLS("bpf"); static int __init cls_bpf_init_mod(void) { return register_tcf_proto_ops(&cls_bpf_ops); } static void __exit cls_bpf_exit_mod(void) { unregister_tcf_proto_ops(&cls_bpf_ops); } module_init(cls_bpf_init_mod); module_exit(cls_bpf_exit_mod);
427 229 426 427 345 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 /* * memfd_create system call and file sealing support * * Code was originally included in shmem.c, and broken out to facilitate * use by hugetlbfs as well as tmpfs. * * This file is released under the GPL. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/khugepaged.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/memfd.h> #include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> /* * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. */ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ static bool memfd_folio_has_extra_refs(struct folio *folio) { return folio_ref_count(folio) - folio_mapcount(folio) != folio_nr_pages(folio); } static void memfd_tag_pins(struct xa_state *xas) { struct folio *folio; int latency = 0; lru_add_drain(); xas_lock_irq(xas); xas_for_each(xas, folio, ULONG_MAX) { if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) xas_set_mark(xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(xas); xas_unlock_irq(xas); cond_resched(); xas_lock_irq(xas); } xas_unlock_irq(xas); } /* * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c). * It is mainly called to allocate a folio in a memfd when the caller * (memfd_pin_folios()) cannot find a folio in the page cache at a given * index in the mapping. */ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { #ifdef CONFIG_HUGETLB_PAGE struct folio *folio; gfp_t gfp_mask; int err; if (is_file_hugepages(memfd)) { /* * The folio would most likely be accessed by a DMA driver, * therefore, we have zone memory constraints where we can * alloc from. Also, the folio will be pinned for an indefinite * amount of time, so it is not expected to be migrated away. */ struct hstate *h = hstate_file(memfd); gfp_mask = htlb_alloc_mask(h); gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); idx >>= huge_page_order(h); folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL, gfp_mask); if (folio) { err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx); if (err) { folio_put(folio); return ERR_PTR(err); } folio_unlock(folio); return folio; } return ERR_PTR(-ENOMEM); } #endif return shmem_read_folio(memfd->f_mapping, idx); } /* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references * to those folios to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); struct folio *folio; int error, scan; memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { int latency = 0; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) lru_add_drain_all(); else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; xas_set(&xas, 0); xas_lock_irq(&xas); xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found folios pinned. */ if (scan == LAST_SCAN) error = -EBUSY; else clear = false; } if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); if (++latency < XA_CHECK_SCHED) continue; latency = 0; xas_pause(&xas); xas_unlock_irq(&xas); cond_resched(); xas_lock_irq(&xas); } xas_unlock_irq(&xas); } return error; } static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; #ifdef CONFIG_HUGETLBFS if (is_file_hugepages(file)) return &HUGETLBFS_I(file_inode(file))->seals; #endif return NULL; } #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE | \ F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); unsigned int *file_seals; int error; /* * SEALING * Sealing allows multiple parties to share a tmpfs or hugetlbfs file * but restrict access to a specific subset of file operations. Seals * can only be added, but never removed. This way, mutually untrusted * parties can share common memory regions with a well-defined policy. * A malicious peer can thus never perform unwanted operations on a * shared object. * * Seals are only supported on special tmpfs or hugetlbfs files and * always affect the whole underlying inode. Once a seal is set, it * may prevent some kinds of access to the file. Currently, the * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file * SEAL_EXEC: Prevent modification of the exec bits in the file mode * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file * only adds a given set of seals to the file, it never touches * existing seals. Furthermore, the "setting seals"-operation can be * sealed itself, which basically prevents any further seal from being * added. * * Semantics of sealing are only defined on volatile files. Only * anonymous tmpfs and hugetlbfs files support sealing. More * importantly, seals are never written to disk. Therefore, there's * no plan to support it on other file types. */ if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; inode_lock(inode); file_seals = memfd_file_seals_ptr(file); if (!file_seals) { error = -EINVAL; goto unlock; } if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; } } /* * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. */ if (seals & F_SEAL_EXEC && inode->i_mode & 0111) seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; *file_seals |= seals; error = 0; unlock: inode_unlock(inode); return error; } static int memfd_get_seals(struct file *file) { unsigned int *seals = memfd_file_seals_ptr(file); return seals ? *seals : -EINVAL; } long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { long error; switch (cmd) { case F_ADD_SEALS: error = memfd_add_seals(file, arg); break; case F_GET_SEALS: error = memfd_get_seals(file); break; default: error = -EINVAL; break; } return error; } #define MFD_NAME_PREFIX "memfd:" #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL struct pid_namespace *ns = task_active_pid_ns(current); int sysctl = pidns_memfd_noexec_scope(ns); if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) *flags |= MFD_NOEXEC_SEAL; else *flags |= MFD_EXEC; } if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { pr_err_ratelimited( "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", current->comm, task_pid_nr(current), sysctl); return -EACCES; } #endif return 0; } static inline bool is_write_sealed(unsigned int seals) { return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); } static int check_write_seal(unsigned long *vm_flags_ptr) { unsigned long vm_flags = *vm_flags_ptr; unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); /* If a private mapping then writability is irrelevant. */ if (!(mask & VM_SHARED)) return 0; /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when * write seals are active. */ if (mask & VM_WRITE) return -EPERM; /* * This is a read-only mapping, disallow mprotect() from making a * write-sealed mapping writable in future. */ *vm_flags_ptr &= ~VM_MAYWRITE; return 0; } int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); unsigned int seals = seals_ptr ? *seals_ptr : 0; if (is_write_sealed(seals)) err = check_write_seal(vm_flags_ptr); return err; } static int sanitize_flags(unsigned int *flags_ptr) { unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { if (flags & ~(unsigned int)MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. */ if (flags & ~(unsigned int)(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; return check_sysctl_memfd_noexec(flags_ptr); } static char *alloc_name(const char __user *uname) { int error; char *name; long len; name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return ERR_PTR(-ENOMEM); strcpy(name, MFD_NAME_PREFIX); /* returned length does not include terminating zero */ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); if (len < 0) { error = -EFAULT; goto err_name; } else if (len > MFD_NAME_MAX_LEN) { error = -EINVAL; goto err_name; } return name; err_name: kfree(name); return ERR_PTR(error); } static struct file *alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); } else { file = shmem_file_setup(name, 0, VM_NORESERVE); } if (IS_ERR(file)) return file; file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; if (flags & MFD_NOEXEC_SEAL) { struct inode *inode = file_inode(file); inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); if (file_seals) { *file_seals &= ~F_SEAL_SEAL; *file_seals |= F_SEAL_EXEC; } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); if (file_seals) *file_seals &= ~F_SEAL_SEAL; } return file; } SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { struct file *file; int fd, error; char *name; error = sanitize_flags(&flags); if (error < 0) return error; name = alloc_name(uname); if (IS_ERR(name)) return PTR_ERR(name); fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); if (fd < 0) { error = fd; goto err_name; } file = alloc_file(name, flags); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_fd; } fd_install(fd, file); kfree(name); return fd; err_fd: put_unused_fd(fd); err_name: kfree(name); return error; }
2 2 2 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2016 Tomasz Chilinski <tomasz.chilinski@chilan.com> */ /* Kernel module implementing an IP set type: the hash:ip,mac type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <linux/if_ether.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 #define IPSET_TYPE_REV_MAX 1 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Tomasz Chilinski <tomasz.chilinski@chilan.com>"); IP_SET_MODULE_DESC("hash:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,mac"); /* Type specific function prefix */ #define HTYPE hash_ipmac /* IPv4 variant */ /* Member elements */ struct hash_ipmac4_elem { /* Zero valued IP addresses cannot be stored */ __be32 ip; union { unsigned char ether[ETH_ALEN]; __be32 foo[2]; }; }; /* Common functions */ static bool hash_ipmac4_data_equal(const struct hash_ipmac4_elem *e1, const struct hash_ipmac4_elem *e2, u32 *multi) { return e1->ip == e2->ip && ether_addr_equal(e1->ether, e2->ether); } static bool hash_ipmac4_data_list(struct sk_buff *skb, const struct hash_ipmac4_elem *e) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip) || nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmac4_data_next(struct hash_ipmac4_elem *next, const struct hash_ipmac4_elem *e) { next->ip = e->ip; } #define MTYPE hash_ipmac4 #define PF 4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_ipmac4_elem) #include "ip_set_hash_gen.h" static int hash_ipmac4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmac4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmac4_elem e = { .ip = 0, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_ETHER] || nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip) || ip_set_get_extensions(set, tb, &ext); if (ret) return ret; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); } /* IPv6 variant */ /* Member elements */ struct hash_ipmac6_elem { /* Zero valued IP addresses cannot be stored */ union nf_inet_addr ip; union { unsigned char ether[ETH_ALEN]; __be32 foo[2]; }; }; /* Common functions */ static bool hash_ipmac6_data_equal(const struct hash_ipmac6_elem *e1, const struct hash_ipmac6_elem *e2, u32 *multi) { return ipv6_addr_equal(&e1->ip.in6, &e2->ip.in6) && ether_addr_equal(e1->ether, e2->ether); } static bool hash_ipmac6_data_list(struct sk_buff *skb, const struct hash_ipmac6_elem *e) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) || nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, e->ether)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmac6_data_next(struct hash_ipmac6_elem *next, const struct hash_ipmac6_elem *e) { } #undef MTYPE #undef PF #undef HOST_MASK #undef HKEY_DATALEN #define MTYPE hash_ipmac6 #define PF 6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_ipmac6_elem) #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipmac6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmac6_elem e = { { .all = { 0 } }, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmac6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmac6_elem e = { { .all = { 0 } }, { .foo[0] = 0, .foo[1] = 0 } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_ETHER] || nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) || !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBMARK) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBPRIO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_SKBQUEUE))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) || ip_set_get_extensions(set, tb, &ext); if (ret) return ret; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); if (is_zero_ether_addr(e.ether)) return -IPSET_ERR_HASH_ELEM; return adtfn(set, &e, &ext, &ext, flags); } static struct ip_set_type hash_ipmac_type __read_mostly = { .name = "hash:ip,mac", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmac_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipmac_init(void) { return ip_set_type_register(&hash_ipmac_type); } static void __exit hash_ipmac_fini(void) { ip_set_type_unregister(&hash_ipmac_type); } module_init(hash_ipmac_init); module_exit(hash_ipmac_fini);
106 106 106 106 3 106 106 104 6 106 6 104 106 106 106 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Politecnico di Torino, Italy * TORSEC group -- https://security.polito.it * * Author: Roberto Sassu <roberto.sassu@polito.it> * * File: ima_template_lib.c * Library of supported template fields. */ #include "ima_template_lib.h" #include <linux/xattr.h> #include <linux/evm.h> static bool ima_template_hash_algo_allowed(u8 algo) { if (algo == HASH_ALGO_SHA1 || algo == HASH_ALGO_MD5) return true; return false; } enum data_formats { DATA_FMT_DIGEST = 0, DATA_FMT_DIGEST_WITH_ALGO, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, DATA_FMT_STRING, DATA_FMT_HEX, DATA_FMT_UINT }; enum digest_type { DIGEST_TYPE_IMA, DIGEST_TYPE_VERITY, DIGEST_TYPE__LAST }; #define DIGEST_TYPE_NAME_LEN_MAX 7 /* including NUL */ static const char * const digest_type_name[DIGEST_TYPE__LAST] = { [DIGEST_TYPE_IMA] = "ima", [DIGEST_TYPE_VERITY] = "verity" }; static int ima_write_template_field_data(const void *data, const u32 datalen, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf, *buf_ptr; u32 buflen = datalen; if (datafmt == DATA_FMT_STRING) buflen = datalen + 1; buf = kzalloc(buflen, GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, data, datalen); /* * Replace all space characters with underscore for event names and * strings. This avoid that, during the parsing of a measurements list, * filenames with spaces or that end with the suffix ' (deleted)' are * split into multiple template fields (the space is the delimitator * character for measurements lists in ASCII format). */ if (datafmt == DATA_FMT_STRING) { for (buf_ptr = buf; buf_ptr - buf < datalen; buf_ptr++) if (*buf_ptr == ' ') *buf_ptr = '_'; } field_data->data = buf; field_data->len = buflen; return 0; } static void ima_show_template_data_ascii(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u8 *buf_ptr = field_data->data; u32 buflen = field_data->len; switch (datafmt) { case DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: case DATA_FMT_DIGEST_WITH_ALGO: buf_ptr = strrchr(field_data->data, ':'); if (buf_ptr != field_data->data) seq_printf(m, "%s", field_data->data); /* skip ':' and '\0' */ buf_ptr += 2; buflen -= buf_ptr - field_data->data; fallthrough; case DATA_FMT_DIGEST: case DATA_FMT_HEX: if (!buflen) break; ima_print_digest(m, buf_ptr, buflen); break; case DATA_FMT_STRING: seq_printf(m, "%s", buf_ptr); break; case DATA_FMT_UINT: switch (field_data->len) { case sizeof(u8): seq_printf(m, "%u", *(u8 *)buf_ptr); break; case sizeof(u16): if (ima_canonical_fmt) seq_printf(m, "%u", le16_to_cpu(*(__le16 *)buf_ptr)); else seq_printf(m, "%u", *(u16 *)buf_ptr); break; case sizeof(u32): if (ima_canonical_fmt) seq_printf(m, "%u", le32_to_cpu(*(__le32 *)buf_ptr)); else seq_printf(m, "%u", *(u32 *)buf_ptr); break; case sizeof(u64): if (ima_canonical_fmt) seq_printf(m, "%llu", le64_to_cpu(*(__le64 *)buf_ptr)); else seq_printf(m, "%llu", *(u64 *)buf_ptr); break; default: break; } break; default: break; } } static void ima_show_template_data_binary(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { u32 len = (show == IMA_SHOW_BINARY_OLD_STRING_FMT) ? strlen(field_data->data) : field_data->len; if (show != IMA_SHOW_BINARY_NO_FIELD_LEN) { u32 field_len = !ima_canonical_fmt ? len : (__force u32)cpu_to_le32(len); ima_putc(m, &field_len, sizeof(field_len)); } if (!len) return; ima_putc(m, field_data->data, len); } static void ima_show_template_field_data(struct seq_file *m, enum ima_show_type show, enum data_formats datafmt, struct ima_field_data *field_data) { switch (show) { case IMA_SHOW_ASCII: ima_show_template_data_ascii(m, show, datafmt, field_data); break; case IMA_SHOW_BINARY: case IMA_SHOW_BINARY_NO_FIELD_LEN: case IMA_SHOW_BINARY_OLD_STRING_FMT: ima_show_template_data_binary(m, show, datafmt, field_data); break; default: break; } } void ima_show_template_digest(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST, field_data); } void ima_show_template_digest_ng(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_ALGO, field_data); } void ima_show_template_digest_ngv2(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO, field_data); } void ima_show_template_string(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_STRING, field_data); } void ima_show_template_sig(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_buf(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_HEX, field_data); } void ima_show_template_uint(struct seq_file *m, enum ima_show_type show, struct ima_field_data *field_data) { ima_show_template_field_data(m, show, DATA_FMT_UINT, field_data); } /** * ima_parse_buf() - Parses lengths and data from an input buffer * @bufstartp: Buffer start address. * @bufendp: Buffer end address. * @bufcurp: Pointer to remaining (non-parsed) data. * @maxfields: Length of fields array. * @fields: Array containing lengths and pointers of parsed data. * @curfields: Number of array items containing parsed data. * @len_mask: Bitmap (if bit is set, data length should not be parsed). * @enforce_mask: Check if curfields == maxfields and/or bufcurp == bufendp. * @bufname: String identifier of the input buffer. * * Return: 0 on success, -EINVAL on error. */ int ima_parse_buf(void *bufstartp, void *bufendp, void **bufcurp, int maxfields, struct ima_field_data *fields, int *curfields, unsigned long *len_mask, int enforce_mask, char *bufname) { void *bufp = bufstartp; int i; for (i = 0; i < maxfields; i++) { if (len_mask == NULL || !test_bit(i, len_mask)) { if (bufp > (bufendp - sizeof(u32))) break; if (ima_canonical_fmt) fields[i].len = le32_to_cpu(*(__le32 *)bufp); else fields[i].len = *(u32 *)bufp; bufp += sizeof(u32); } if (bufp > (bufendp - fields[i].len)) break; fields[i].data = bufp; bufp += fields[i].len; } if ((enforce_mask & ENFORCE_FIELDS) && i != maxfields) { pr_err("%s: nr of fields mismatch: expected: %d, current: %d\n", bufname, maxfields, i); return -EINVAL; } if ((enforce_mask & ENFORCE_BUFEND) && bufp != bufendp) { pr_err("%s: buf end mismatch: expected: %p, current: %p\n", bufname, bufendp, bufp); return -EINVAL; } if (curfields) *curfields = i; if (bufcurp) *bufcurp = bufp; return 0; } static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, u8 digest_type, u8 hash_algo, struct ima_field_data *field_data) { /* * digest formats: * - DATA_FMT_DIGEST: digest * - DATA_FMT_DIGEST_WITH_ALGO: <hash algo> + ':' + '\0' + digest, * - DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO: * <digest type> + ':' + <hash algo> + ':' + '\0' + digest, * * where 'DATA_FMT_DIGEST' is the original digest format ('d') * with a hash size limitation of 20 bytes, * where <digest type> is either "ima" or "verity", * where <hash algo> is the hash_algo_name[] string. */ u8 buffer[DIGEST_TYPE_NAME_LEN_MAX + CRYPTO_MAX_ALG_NAME + 2 + IMA_MAX_DIGEST_SIZE] = { 0 }; enum data_formats fmt = DATA_FMT_DIGEST; u32 offset = 0; if (digest_type < DIGEST_TYPE__LAST && hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_TYPE_AND_ALGO; offset += 1 + sprintf(buffer, "%s:%s:", digest_type_name[digest_type], hash_algo_name[hash_algo]); } else if (hash_algo < HASH_ALGO__LAST) { fmt = DATA_FMT_DIGEST_WITH_ALGO; offset += 1 + sprintf(buffer, "%s:", hash_algo_name[hash_algo]); } if (digest) { memcpy(buffer + offset, digest, digestsize); } else { /* * If digest is NULL, the event being recorded is a violation. * Make room for the digest by increasing the offset by the * hash algorithm digest size. If the hash algorithm is not * specified increase the offset by IMA_DIGEST_SIZE which * fits SHA1 or MD5 */ if (hash_algo < HASH_ALGO__LAST) offset += hash_digest_size[hash_algo]; else offset += IMA_DIGEST_SIZE; } return ima_write_template_field_data(buffer, offset + digestsize, fmt, field_data); } /* * This function writes the digest of an event (with size limit). */ int ima_eventdigest_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct ima_max_digest_data hash; struct ima_digest_data *hash_hdr = container_of(&hash.hdr, struct ima_digest_data, hdr); u8 *cur_digest = NULL; u32 cur_digestsize = 0; struct inode *inode; int result; memset(&hash, 0, sizeof(hash)); if (event_data->violation) /* recording a violation. */ goto out; if (ima_template_hash_algo_allowed(event_data->iint->ima_hash->algo)) { cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; goto out; } if ((const char *)event_data->filename == boot_aggregate_name) { if (ima_tpm_chip) { hash.hdr.algo = HASH_ALGO_SHA1; result = ima_calc_boot_aggregate(hash_hdr); /* algo can change depending on available PCR banks */ if (!result && hash.hdr.algo != HASH_ALGO_SHA1) result = -EINVAL; if (result < 0) memset(&hash, 0, sizeof(hash)); } cur_digest = hash_hdr->digest; cur_digestsize = hash_digest_size[HASH_ALGO_SHA1]; goto out; } if (!event_data->file) /* missing info to re-calculate the digest */ return -EINVAL; inode = file_inode(event_data->file); hash.hdr.algo = ima_template_hash_algo_allowed(ima_hash_algo) ? ima_hash_algo : HASH_ALGO_SHA1; result = ima_calc_file_hash(event_data->file, hash_hdr); if (result) { integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, event_data->filename, "collect_data", "failed", result, 0); return result; } cur_digest = hash_hdr->digest; cur_digestsize = hash.hdr.length; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, HASH_ALGO__LAST, field_data); } /* * This function writes the digest of an event (without size limit). */ int ima_eventdigest_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } /* * This function writes the digest of an event (without size limit), * prefixed with both the digest type and hash algorithm. */ int ima_eventdigest_ngv2_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { u8 *cur_digest = NULL, hash_algo = ima_hash_algo; u32 cur_digestsize = 0; u8 digest_type = DIGEST_TYPE_IMA; if (event_data->violation) /* recording a violation. */ goto out; cur_digest = event_data->iint->ima_hash->digest; cur_digestsize = event_data->iint->ima_hash->length; hash_algo = event_data->iint->ima_hash->algo; if (event_data->iint->flags & IMA_VERITY_REQUIRED) digest_type = DIGEST_TYPE_VERITY; out: return ima_eventdigest_init_common(cur_digest, cur_digestsize, digest_type, hash_algo, field_data); } /* * This function writes the digest of the file which is expected to match the * digest contained in the file's appended signature. */ int ima_eventdigest_modsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { enum hash_algo hash_algo; const u8 *cur_digest; u32 cur_digestsize; if (!event_data->modsig) return 0; if (event_data->violation) { /* Recording a violation. */ hash_algo = HASH_ALGO_SHA1; cur_digest = NULL; cur_digestsize = 0; } else { int rc; rc = ima_get_modsig_digest(event_data->modsig, &hash_algo, &cur_digest, &cur_digestsize); if (rc) return rc; else if (hash_algo == HASH_ALGO__LAST || cur_digestsize == 0) /* There was some error collecting the digest. */ return -EINVAL; } return ima_eventdigest_init_common(cur_digest, cur_digestsize, DIGEST_TYPE__LAST, hash_algo, field_data); } static int ima_eventname_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool size_limit) { const char *cur_filename = NULL; struct name_snapshot filename; u32 cur_filename_len = 0; bool snapshot = false; int ret; BUG_ON(event_data->filename == NULL && event_data->file == NULL); if (event_data->filename) { cur_filename = event_data->filename; cur_filename_len = strlen(event_data->filename); if (!size_limit || cur_filename_len <= IMA_EVENT_NAME_LEN_MAX) goto out; } if (event_data->file) { take_dentry_name_snapshot(&filename, event_data->file->f_path.dentry); snapshot = true; cur_filename = filename.name.name; cur_filename_len = strlen(cur_filename); } else /* * Truncate filename if the latter is too long and * the file descriptor is not available. */ cur_filename_len = IMA_EVENT_NAME_LEN_MAX; out: ret = ima_write_template_field_data(cur_filename, cur_filename_len, DATA_FMT_STRING, field_data); if (snapshot) release_dentry_name_snapshot(&filename); return ret; } /* * This function writes the name of an event (with size limit). */ int ima_eventname_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, true); } /* * This function writes the name of an event (without size limit). */ int ima_eventname_ng_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventname_init_common(event_data, field_data, false); } /* * ima_eventsig_init - include the file signature as part of the template data */ int ima_eventsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_value = event_data->xattr_value; if (!xattr_value || (xattr_value->type != EVM_IMA_XATTR_DIGSIG && xattr_value->type != IMA_VERITY_DIGSIG)) return ima_eventevmsig_init(event_data, field_data); return ima_write_template_field_data(xattr_value, event_data->xattr_len, DATA_FMT_HEX, field_data); } /* * ima_eventbuf_init - include the buffer(kexec-cmldine) as part of the * template data. */ int ima_eventbuf_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { if ((!event_data->buf) || (event_data->buf_len == 0)) return 0; return ima_write_template_field_data(event_data->buf, event_data->buf_len, DATA_FMT_HEX, field_data); } /* * ima_eventmodsig_init - include the appended file signature as part of the * template data */ int ima_eventmodsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { const void *data; u32 data_len; int rc; if (!event_data->modsig) return 0; /* * modsig is a runtime structure containing pointers. Get its raw data * instead. */ rc = ima_get_raw_modsig(event_data->modsig, &data, &data_len); if (rc) return rc; return ima_write_template_field_data(data, data_len, DATA_FMT_HEX, field_data); } /* * ima_eventevmsig_init - include the EVM portable signature as part of the * template data */ int ima_eventevmsig_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct evm_ima_xattr_data *xattr_data = NULL; int rc = 0; if (!event_data->file) return 0; rc = vfs_getxattr_alloc(&nop_mnt_idmap, file_dentry(event_data->file), XATTR_NAME_EVM, (char **)&xattr_data, 0, GFP_NOFS); if (rc <= 0 || xattr_data->type != EVM_XATTR_PORTABLE_DIGSIG) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)xattr_data, rc, DATA_FMT_HEX, field_data); out: kfree(xattr_data); return rc; } static int ima_eventinodedac_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, bool get_uid) { unsigned int id; if (!event_data->file) return 0; if (get_uid) id = i_uid_read(file_inode(event_data->file)); else id = i_gid_read(file_inode(event_data->file)); if (ima_canonical_fmt) { if (sizeof(id) == sizeof(u16)) id = (__force u16)cpu_to_le16(id); else id = (__force u32)cpu_to_le32(id); } return ima_write_template_field_data((void *)&id, sizeof(id), DATA_FMT_UINT, field_data); } /* * ima_eventinodeuid_init - include the inode UID as part of the template * data */ int ima_eventinodeuid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, true); } /* * ima_eventinodegid_init - include the inode GID as part of the template * data */ int ima_eventinodegid_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodedac_init_common(event_data, field_data, false); } /* * ima_eventinodemode_init - include the inode mode as part of the template * data */ int ima_eventinodemode_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { struct inode *inode; u16 mode; if (!event_data->file) return 0; inode = file_inode(event_data->file); mode = inode->i_mode; if (ima_canonical_fmt) mode = (__force u16)cpu_to_le16(mode); return ima_write_template_field_data((char *)&mode, sizeof(mode), DATA_FMT_UINT, field_data); } static int ima_eventinodexattrs_init_common(struct ima_event_data *event_data, struct ima_field_data *field_data, char type) { u8 *buffer = NULL; int rc; if (!event_data->file) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), NULL, 0, type, ima_canonical_fmt); if (rc < 0) return 0; buffer = kmalloc(rc, GFP_KERNEL); if (!buffer) return 0; rc = evm_read_protected_xattrs(file_dentry(event_data->file), buffer, rc, type, ima_canonical_fmt); if (rc < 0) { rc = 0; goto out; } rc = ima_write_template_field_data((char *)buffer, rc, DATA_FMT_HEX, field_data); out: kfree(buffer); return rc; } /* * ima_eventinodexattrnames_init - include a list of xattr names as part of the * template data */ int ima_eventinodexattrnames_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'n'); } /* * ima_eventinodexattrlengths_init - include a list of xattr lengths as part of * the template data */ int ima_eventinodexattrlengths_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'l'); } /* * ima_eventinodexattrvalues_init - include a list of xattr values as part of * the template data */ int ima_eventinodexattrvalues_init(struct ima_event_data *event_data, struct ima_field_data *field_data) { return ima_eventinodexattrs_init_common(event_data, field_data, 'v'); }
58 45 57 56 57 13 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Copyright (C) 2009-2010 Gustavo F. Padovan <gustavo@padovan.org> Copyright (C) 2010 Google Inc. Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #ifndef __L2CAP_H #define __L2CAP_H #include <linux/unaligned.h> #include <linux/atomic.h> /* L2CAP defaults */ #define L2CAP_DEFAULT_MTU 672 #define L2CAP_DEFAULT_MIN_MTU 48 #define L2CAP_DEFAULT_FLUSH_TO 0xFFFF #define L2CAP_EFS_DEFAULT_FLUSH_TO 0xFFFFFFFF #define L2CAP_DEFAULT_TX_WINDOW 63 #define L2CAP_DEFAULT_EXT_WINDOW 0x3FFF #define L2CAP_DEFAULT_MAX_TX 3 #define L2CAP_DEFAULT_RETRANS_TO 2 /* seconds */ #define L2CAP_DEFAULT_MONITOR_TO 12 /* seconds */ #define L2CAP_DEFAULT_MAX_PDU_SIZE 1492 /* Sized for AMP packet */ #define L2CAP_DEFAULT_ACK_TO 200 #define L2CAP_DEFAULT_MAX_SDU_SIZE 0xFFFF #define L2CAP_DEFAULT_SDU_ITIME 0xFFFFFFFF #define L2CAP_DEFAULT_ACC_LAT 0xFFFFFFFF #define L2CAP_BREDR_MAX_PAYLOAD 1019 /* 3-DH5 packet */ #define L2CAP_LE_MIN_MTU 23 #define L2CAP_ECRED_CONN_SCID_MAX 5 #define L2CAP_DISC_TIMEOUT msecs_to_jiffies(100) #define L2CAP_DISC_REJ_TIMEOUT msecs_to_jiffies(5000) #define L2CAP_ENC_TIMEOUT msecs_to_jiffies(5000) #define L2CAP_CONN_TIMEOUT msecs_to_jiffies(40000) #define L2CAP_INFO_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_ERTX_TIMEOUT msecs_to_jiffies(60000) #define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) #define L2CAP_WAIT_ACK_TIMEOUT msecs_to_jiffies(10000) /* L2CAP socket address */ struct sockaddr_l2 { sa_family_t l2_family; __le16 l2_psm; bdaddr_t l2_bdaddr; __le16 l2_cid; __u8 l2_bdaddr_type; }; /* L2CAP socket options */ #define L2CAP_OPTIONS 0x01 struct l2cap_options { __u16 omtu; __u16 imtu; __u16 flush_to; __u8 mode; __u8 fcs; __u8 max_tx; __u16 txwin_size; }; #define L2CAP_CONNINFO 0x02 struct l2cap_conninfo { __u16 hci_handle; __u8 dev_class[3]; }; #define L2CAP_LM 0x03 #define L2CAP_LM_MASTER 0x0001 #define L2CAP_LM_AUTH 0x0002 #define L2CAP_LM_ENCRYPT 0x0004 #define L2CAP_LM_TRUSTED 0x0008 #define L2CAP_LM_RELIABLE 0x0010 #define L2CAP_LM_SECURE 0x0020 #define L2CAP_LM_FIPS 0x0040 /* L2CAP command codes */ #define L2CAP_COMMAND_REJ 0x01 #define L2CAP_CONN_REQ 0x02 #define L2CAP_CONN_RSP 0x03 #define L2CAP_CONF_REQ 0x04 #define L2CAP_CONF_RSP 0x05 #define L2CAP_DISCONN_REQ 0x06 #define L2CAP_DISCONN_RSP 0x07 #define L2CAP_ECHO_REQ 0x08 #define L2CAP_ECHO_RSP 0x09 #define L2CAP_INFO_REQ 0x0a #define L2CAP_INFO_RSP 0x0b #define L2CAP_CONN_PARAM_UPDATE_REQ 0x12 #define L2CAP_CONN_PARAM_UPDATE_RSP 0x13 #define L2CAP_LE_CONN_REQ 0x14 #define L2CAP_LE_CONN_RSP 0x15 #define L2CAP_LE_CREDITS 0x16 #define L2CAP_ECRED_CONN_REQ 0x17 #define L2CAP_ECRED_CONN_RSP 0x18 #define L2CAP_ECRED_RECONF_REQ 0x19 #define L2CAP_ECRED_RECONF_RSP 0x1a /* L2CAP extended feature mask */ #define L2CAP_FEAT_FLOWCTL 0x00000001 #define L2CAP_FEAT_RETRANS 0x00000002 #define L2CAP_FEAT_BIDIR_QOS 0x00000004 #define L2CAP_FEAT_ERTM 0x00000008 #define L2CAP_FEAT_STREAMING 0x00000010 #define L2CAP_FEAT_FCS 0x00000020 #define L2CAP_FEAT_EXT_FLOW 0x00000040 #define L2CAP_FEAT_FIXED_CHAN 0x00000080 #define L2CAP_FEAT_EXT_WINDOW 0x00000100 #define L2CAP_FEAT_UCD 0x00000200 /* L2CAP checksum option */ #define L2CAP_FCS_NONE 0x00 #define L2CAP_FCS_CRC16 0x01 /* L2CAP fixed channels */ #define L2CAP_FC_SIG_BREDR 0x02 #define L2CAP_FC_CONNLESS 0x04 #define L2CAP_FC_ATT 0x10 #define L2CAP_FC_SIG_LE 0x20 #define L2CAP_FC_SMP_LE 0x40 #define L2CAP_FC_SMP_BREDR 0x80 /* L2CAP Control Field bit masks */ #define L2CAP_CTRL_SAR 0xC000 #define L2CAP_CTRL_REQSEQ 0x3F00 #define L2CAP_CTRL_TXSEQ 0x007E #define L2CAP_CTRL_SUPERVISE 0x000C #define L2CAP_CTRL_RETRANS 0x0080 #define L2CAP_CTRL_FINAL 0x0080 #define L2CAP_CTRL_POLL 0x0010 #define L2CAP_CTRL_FRAME_TYPE 0x0001 /* I- or S-Frame */ #define L2CAP_CTRL_TXSEQ_SHIFT 1 #define L2CAP_CTRL_SUPER_SHIFT 2 #define L2CAP_CTRL_POLL_SHIFT 4 #define L2CAP_CTRL_FINAL_SHIFT 7 #define L2CAP_CTRL_REQSEQ_SHIFT 8 #define L2CAP_CTRL_SAR_SHIFT 14 /* L2CAP Extended Control Field bit mask */ #define L2CAP_EXT_CTRL_TXSEQ 0xFFFC0000 #define L2CAP_EXT_CTRL_SAR 0x00030000 #define L2CAP_EXT_CTRL_SUPERVISE 0x00030000 #define L2CAP_EXT_CTRL_REQSEQ 0x0000FFFC #define L2CAP_EXT_CTRL_POLL 0x00040000 #define L2CAP_EXT_CTRL_FINAL 0x00000002 #define L2CAP_EXT_CTRL_FRAME_TYPE 0x00000001 /* I- or S-Frame */ #define L2CAP_EXT_CTRL_FINAL_SHIFT 1 #define L2CAP_EXT_CTRL_REQSEQ_SHIFT 2 #define L2CAP_EXT_CTRL_SAR_SHIFT 16 #define L2CAP_EXT_CTRL_SUPER_SHIFT 16 #define L2CAP_EXT_CTRL_POLL_SHIFT 18 #define L2CAP_EXT_CTRL_TXSEQ_SHIFT 18 /* L2CAP Supervisory Function */ #define L2CAP_SUPER_RR 0x00 #define L2CAP_SUPER_REJ 0x01 #define L2CAP_SUPER_RNR 0x02 #define L2CAP_SUPER_SREJ 0x03 /* L2CAP Segmentation and Reassembly */ #define L2CAP_SAR_UNSEGMENTED 0x00 #define L2CAP_SAR_START 0x01 #define L2CAP_SAR_END 0x02 #define L2CAP_SAR_CONTINUE 0x03 /* L2CAP Command rej. reasons */ #define L2CAP_REJ_NOT_UNDERSTOOD 0x0000 #define L2CAP_REJ_MTU_EXCEEDED 0x0001 #define L2CAP_REJ_INVALID_CID 0x0002 /* L2CAP structures */ struct l2cap_hdr { __le16 len; __le16 cid; } __packed; #define L2CAP_LEN_SIZE 2 #define L2CAP_HDR_SIZE 4 #define L2CAP_ENH_HDR_SIZE 6 #define L2CAP_EXT_HDR_SIZE 8 #define L2CAP_FCS_SIZE 2 #define L2CAP_SDULEN_SIZE 2 #define L2CAP_PSMLEN_SIZE 2 #define L2CAP_ENH_CTRL_SIZE 2 #define L2CAP_EXT_CTRL_SIZE 4 struct l2cap_cmd_hdr { __u8 code; __u8 ident; __le16 len; } __packed; #define L2CAP_CMD_HDR_SIZE 4 struct l2cap_cmd_rej_unk { __le16 reason; } __packed; struct l2cap_cmd_rej_mtu { __le16 reason; __le16 max_mtu; } __packed; struct l2cap_cmd_rej_cid { __le16 reason; __le16 scid; __le16 dcid; } __packed; struct l2cap_conn_req { __le16 psm; __le16 scid; } __packed; struct l2cap_conn_rsp { __le16 dcid; __le16 scid; __le16 result; __le16 status; } __packed; /* protocol/service multiplexer (PSM) */ #define L2CAP_PSM_SDP 0x0001 #define L2CAP_PSM_RFCOMM 0x0003 #define L2CAP_PSM_3DSP 0x0021 #define L2CAP_PSM_IPSP 0x0023 /* 6LoWPAN */ #define L2CAP_PSM_DYN_START 0x1001 #define L2CAP_PSM_DYN_END 0xffff #define L2CAP_PSM_AUTO_END 0x10ff #define L2CAP_PSM_LE_DYN_START 0x0080 #define L2CAP_PSM_LE_DYN_END 0x00ff /* channel identifier */ #define L2CAP_CID_SIGNALING 0x0001 #define L2CAP_CID_CONN_LESS 0x0002 #define L2CAP_CID_ATT 0x0004 #define L2CAP_CID_LE_SIGNALING 0x0005 #define L2CAP_CID_SMP 0x0006 #define L2CAP_CID_SMP_BREDR 0x0007 #define L2CAP_CID_DYN_START 0x0040 #define L2CAP_CID_DYN_END 0xffff #define L2CAP_CID_LE_DYN_END 0x007f /* connect/create channel results */ #define L2CAP_CR_SUCCESS 0x0000 #define L2CAP_CR_PEND 0x0001 #define L2CAP_CR_BAD_PSM 0x0002 #define L2CAP_CR_SEC_BLOCK 0x0003 #define L2CAP_CR_NO_MEM 0x0004 #define L2CAP_CR_INVALID_SCID 0x0006 #define L2CAP_CR_SCID_IN_USE 0x0007 /* credit based connect results */ #define L2CAP_CR_LE_SUCCESS 0x0000 #define L2CAP_CR_LE_BAD_PSM 0x0002 #define L2CAP_CR_LE_NO_MEM 0x0004 #define L2CAP_CR_LE_AUTHENTICATION 0x0005 #define L2CAP_CR_LE_AUTHORIZATION 0x0006 #define L2CAP_CR_LE_BAD_KEY_SIZE 0x0007 #define L2CAP_CR_LE_ENCRYPTION 0x0008 #define L2CAP_CR_LE_INVALID_SCID 0x0009 #define L2CAP_CR_LE_SCID_IN_USE 0X000A #define L2CAP_CR_LE_UNACCEPT_PARAMS 0X000B #define L2CAP_CR_LE_INVALID_PARAMS 0X000C /* connect/create channel status */ #define L2CAP_CS_NO_INFO 0x0000 #define L2CAP_CS_AUTHEN_PEND 0x0001 #define L2CAP_CS_AUTHOR_PEND 0x0002 struct l2cap_conf_req { __le16 dcid; __le16 flags; __u8 data[]; } __packed; struct l2cap_conf_rsp { __le16 scid; __le16 flags; __le16 result; __u8 data[]; } __packed; #define L2CAP_CONF_SUCCESS 0x0000 #define L2CAP_CONF_UNACCEPT 0x0001 #define L2CAP_CONF_REJECT 0x0002 #define L2CAP_CONF_UNKNOWN 0x0003 #define L2CAP_CONF_PENDING 0x0004 #define L2CAP_CONF_EFS_REJECT 0x0005 /* configuration req/rsp continuation flag */ #define L2CAP_CONF_FLAG_CONTINUATION 0x0001 struct l2cap_conf_opt { __u8 type; __u8 len; __u8 val[]; } __packed; #define L2CAP_CONF_OPT_SIZE 2 #define L2CAP_CONF_HINT 0x80 #define L2CAP_CONF_MASK 0x7f #define L2CAP_CONF_MTU 0x01 #define L2CAP_CONF_FLUSH_TO 0x02 #define L2CAP_CONF_QOS 0x03 #define L2CAP_CONF_RFC 0x04 #define L2CAP_CONF_FCS 0x05 #define L2CAP_CONF_EFS 0x06 #define L2CAP_CONF_EWS 0x07 #define L2CAP_CONF_MAX_SIZE 22 struct l2cap_conf_rfc { __u8 mode; __u8 txwin_size; __u8 max_transmit; __le16 retrans_timeout; __le16 monitor_timeout; __le16 max_pdu_size; } __packed; #define L2CAP_MODE_BASIC 0x00 #define L2CAP_MODE_RETRANS 0x01 #define L2CAP_MODE_FLOWCTL 0x02 #define L2CAP_MODE_ERTM 0x03 #define L2CAP_MODE_STREAMING 0x04 /* Unlike the above this one doesn't actually map to anything that would * ever be sent over the air. Therefore, use a value that's unlikely to * ever be used in the BR/EDR configuration phase. */ #define L2CAP_MODE_LE_FLOWCTL 0x80 #define L2CAP_MODE_EXT_FLOWCTL 0x81 struct l2cap_conf_efs { __u8 id; __u8 stype; __le16 msdu; __le32 sdu_itime; __le32 acc_lat; __le32 flush_to; } __packed; #define L2CAP_SERV_NOTRAFIC 0x00 #define L2CAP_SERV_BESTEFFORT 0x01 #define L2CAP_SERV_GUARANTEED 0x02 #define L2CAP_BESTEFFORT_ID 0x01 struct l2cap_disconn_req { __le16 dcid; __le16 scid; } __packed; struct l2cap_disconn_rsp { __le16 dcid; __le16 scid; } __packed; struct l2cap_info_req { __le16 type; } __packed; struct l2cap_info_rsp { __le16 type; __le16 result; __u8 data[]; } __packed; #define L2CAP_MR_SUCCESS 0x0000 #define L2CAP_MR_PEND 0x0001 #define L2CAP_MR_BAD_ID 0x0002 #define L2CAP_MR_SAME_ID 0x0003 #define L2CAP_MR_NOT_SUPP 0x0004 #define L2CAP_MR_COLLISION 0x0005 #define L2CAP_MR_NOT_ALLOWED 0x0006 struct l2cap_move_chan_cfm { __le16 icid; __le16 result; } __packed; #define L2CAP_MC_CONFIRMED 0x0000 #define L2CAP_MC_UNCONFIRMED 0x0001 struct l2cap_move_chan_cfm_rsp { __le16 icid; } __packed; /* info type */ #define L2CAP_IT_CL_MTU 0x0001 #define L2CAP_IT_FEAT_MASK 0x0002 #define L2CAP_IT_FIXED_CHAN 0x0003 /* info result */ #define L2CAP_IR_SUCCESS 0x0000 #define L2CAP_IR_NOTSUPP 0x0001 struct l2cap_conn_param_update_req { __le16 min; __le16 max; __le16 latency; __le16 to_multiplier; } __packed; struct l2cap_conn_param_update_rsp { __le16 result; } __packed; /* Connection Parameters result */ #define L2CAP_CONN_PARAM_ACCEPTED 0x0000 #define L2CAP_CONN_PARAM_REJECTED 0x0001 struct l2cap_le_conn_req { __le16 psm; __le16 scid; __le16 mtu; __le16 mps; __le16 credits; } __packed; struct l2cap_le_conn_rsp { __le16 dcid; __le16 mtu; __le16 mps; __le16 credits; __le16 result; } __packed; struct l2cap_le_credits { __le16 cid; __le16 credits; } __packed; #define L2CAP_ECRED_MIN_MTU 64 #define L2CAP_ECRED_MIN_MPS 64 #define L2CAP_ECRED_MAX_CID 5 struct l2cap_ecred_conn_req { /* New members must be added within the struct_group() macro below. */ __struct_group(l2cap_ecred_conn_req_hdr, hdr, __packed, __le16 psm; __le16 mtu; __le16 mps; __le16 credits; ); __le16 scid[]; } __packed; struct l2cap_ecred_conn_rsp { /* New members must be added within the struct_group() macro below. */ struct_group_tagged(l2cap_ecred_conn_rsp_hdr, hdr, __le16 mtu; __le16 mps; __le16 credits; __le16 result; ); __le16 dcid[]; }; struct l2cap_ecred_reconf_req { __le16 mtu; __le16 mps; __le16 scid[]; } __packed; #define L2CAP_RECONF_SUCCESS 0x0000 #define L2CAP_RECONF_INVALID_MTU 0x0001 #define L2CAP_RECONF_INVALID_MPS 0x0002 struct l2cap_ecred_reconf_rsp { __le16 result; } __packed; /* ----- L2CAP channels and connections ----- */ struct l2cap_seq_list { __u16 head; __u16 tail; __u16 mask; __u16 *list; }; #define L2CAP_SEQ_LIST_CLEAR 0xFFFF #define L2CAP_SEQ_LIST_TAIL 0x8000 struct l2cap_chan { struct l2cap_conn *conn; struct kref kref; atomic_t nesting; __u8 state; bdaddr_t dst; __u8 dst_type; bdaddr_t src; __u8 src_type; __le16 psm; __le16 sport; __u16 dcid; __u16 scid; __u16 imtu; __u16 omtu; __u16 flush_to; __u8 mode; __u8 chan_type; __u8 chan_policy; __u8 sec_level; __u8 ident; __u8 conf_req[64]; __u8 conf_len; __u8 num_conf_req; __u8 num_conf_rsp; __u8 fcs; __u16 tx_win; __u16 tx_win_max; __u16 ack_win; __u8 max_tx; __u16 retrans_timeout; __u16 monitor_timeout; __u16 mps; __u16 tx_credits; __u16 rx_credits; /* estimated available receive buffer space or -1 if unknown */ ssize_t rx_avail; __u8 tx_state; __u8 rx_state; unsigned long conf_state; unsigned long conn_state; unsigned long flags; __u16 next_tx_seq; __u16 expected_ack_seq; __u16 expected_tx_seq; __u16 buffer_seq; __u16 srej_save_reqseq; __u16 last_acked_seq; __u16 frames_sent; __u16 unacked_frames; __u8 retry_count; __u16 sdu_len; struct sk_buff *sdu; struct sk_buff *sdu_last_frag; __u16 remote_tx_win; __u8 remote_max_tx; __u16 remote_mps; __u8 local_id; __u8 local_stype; __u16 local_msdu; __u32 local_sdu_itime; __u32 local_acc_lat; __u32 local_flush_to; __u8 remote_id; __u8 remote_stype; __u16 remote_msdu; __u32 remote_sdu_itime; __u32 remote_acc_lat; __u32 remote_flush_to; struct delayed_work chan_timer; struct delayed_work retrans_timer; struct delayed_work monitor_timer; struct delayed_work ack_timer; struct sk_buff *tx_send_head; struct sk_buff_head tx_q; struct sk_buff_head srej_q; struct l2cap_seq_list srej_list; struct l2cap_seq_list retrans_list; struct list_head list; struct list_head global_l; void *data; const struct l2cap_ops *ops; struct mutex lock; }; struct l2cap_ops { char *name; struct l2cap_chan *(*new_connection) (struct l2cap_chan *chan); int (*recv) (struct l2cap_chan * chan, struct sk_buff *skb); void (*teardown) (struct l2cap_chan *chan, int err); void (*close) (struct l2cap_chan *chan); void (*state_change) (struct l2cap_chan *chan, int state, int err); void (*ready) (struct l2cap_chan *chan); void (*defer) (struct l2cap_chan *chan); void (*resume) (struct l2cap_chan *chan); void (*suspend) (struct l2cap_chan *chan); void (*set_shutdown) (struct l2cap_chan *chan); long (*get_sndtimeo) (struct l2cap_chan *chan); struct pid *(*get_peer_pid) (struct l2cap_chan *chan); struct sk_buff *(*alloc_skb) (struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb); int (*filter) (struct l2cap_chan * chan, struct sk_buff *skb); }; struct l2cap_conn { struct hci_conn *hcon; struct hci_chan *hchan; unsigned int mtu; __u32 feat_mask; __u8 remote_fixed_chan; __u8 local_fixed_chan; __u8 info_state; __u8 info_ident; struct delayed_work info_timer; struct sk_buff *rx_skb; __u32 rx_len; __u8 tx_ident; struct mutex ident_lock; struct sk_buff_head pending_rx; struct work_struct pending_rx_work; struct delayed_work id_addr_timer; __u8 disc_reason; struct l2cap_chan *smp; struct list_head chan_l; struct mutex lock; struct kref ref; struct list_head users; }; struct l2cap_user { struct list_head list; int (*probe) (struct l2cap_conn *conn, struct l2cap_user *user); void (*remove) (struct l2cap_conn *conn, struct l2cap_user *user); }; #define L2CAP_INFO_CL_MTU_REQ_SENT 0x01 #define L2CAP_INFO_FEAT_MASK_REQ_SENT 0x04 #define L2CAP_INFO_FEAT_MASK_REQ_DONE 0x08 #define L2CAP_CHAN_RAW 1 #define L2CAP_CHAN_CONN_LESS 2 #define L2CAP_CHAN_CONN_ORIENTED 3 #define L2CAP_CHAN_FIXED 4 /* ----- L2CAP socket info ----- */ #define l2cap_pi(sk) ((struct l2cap_pinfo *) sk) struct l2cap_rx_busy { struct list_head list; struct sk_buff *skb; }; struct l2cap_pinfo { struct bt_sock bt; struct l2cap_chan *chan; struct list_head rx_busy; }; enum { CONF_REQ_SENT, CONF_INPUT_DONE, CONF_OUTPUT_DONE, CONF_MTU_DONE, CONF_MODE_DONE, CONF_CONNECT_PEND, CONF_RECV_NO_FCS, CONF_STATE2_DEVICE, CONF_EWS_RECV, CONF_LOC_CONF_PEND, CONF_REM_CONF_PEND, CONF_NOT_COMPLETE, }; #define L2CAP_CONF_MAX_CONF_REQ 2 #define L2CAP_CONF_MAX_CONF_RSP 2 enum { CONN_SREJ_SENT, CONN_WAIT_F, CONN_SREJ_ACT, CONN_SEND_PBIT, CONN_REMOTE_BUSY, CONN_LOCAL_BUSY, CONN_REJ_ACT, CONN_SEND_FBIT, CONN_RNR_SENT, }; /* Definitions for flags in l2cap_chan */ enum { FLAG_ROLE_SWITCH, FLAG_FORCE_ACTIVE, FLAG_FORCE_RELIABLE, FLAG_FLUSHABLE, FLAG_EXT_CTRL, FLAG_EFS_ENABLE, FLAG_DEFER_SETUP, FLAG_LE_CONN_REQ_SENT, FLAG_ECRED_CONN_REQ_SENT, FLAG_PENDING_SECURITY, FLAG_HOLD_HCI_CONN, }; /* Lock nesting levels for L2CAP channels. We need these because lockdep * otherwise considers all channels equal and will e.g. complain about a * connection oriented channel triggering SMP procedures or a listening * channel creating and locking a child channel. */ enum { L2CAP_NESTING_SMP, L2CAP_NESTING_NORMAL, L2CAP_NESTING_PARENT, }; enum { L2CAP_TX_STATE_XMIT, L2CAP_TX_STATE_WAIT_F, }; enum { L2CAP_RX_STATE_RECV, L2CAP_RX_STATE_SREJ_SENT, L2CAP_RX_STATE_MOVE, L2CAP_RX_STATE_WAIT_P, L2CAP_RX_STATE_WAIT_F, }; enum { L2CAP_TXSEQ_EXPECTED, L2CAP_TXSEQ_EXPECTED_SREJ, L2CAP_TXSEQ_UNEXPECTED, L2CAP_TXSEQ_UNEXPECTED_SREJ, L2CAP_TXSEQ_DUPLICATE, L2CAP_TXSEQ_DUPLICATE_SREJ, L2CAP_TXSEQ_INVALID, L2CAP_TXSEQ_INVALID_IGNORE, }; enum { L2CAP_EV_DATA_REQUEST, L2CAP_EV_LOCAL_BUSY_DETECTED, L2CAP_EV_LOCAL_BUSY_CLEAR, L2CAP_EV_RECV_REQSEQ_AND_FBIT, L2CAP_EV_RECV_FBIT, L2CAP_EV_RETRANS_TO, L2CAP_EV_MONITOR_TO, L2CAP_EV_EXPLICIT_POLL, L2CAP_EV_RECV_IFRAME, L2CAP_EV_RECV_RR, L2CAP_EV_RECV_REJ, L2CAP_EV_RECV_RNR, L2CAP_EV_RECV_SREJ, L2CAP_EV_RECV_FRAME, }; enum { L2CAP_MOVE_ROLE_NONE, L2CAP_MOVE_ROLE_INITIATOR, L2CAP_MOVE_ROLE_RESPONDER, }; enum { L2CAP_MOVE_STABLE, L2CAP_MOVE_WAIT_REQ, L2CAP_MOVE_WAIT_RSP, L2CAP_MOVE_WAIT_RSP_SUCCESS, L2CAP_MOVE_WAIT_CONFIRM, L2CAP_MOVE_WAIT_CONFIRM_RSP, L2CAP_MOVE_WAIT_LOGICAL_COMP, L2CAP_MOVE_WAIT_LOGICAL_CFM, L2CAP_MOVE_WAIT_LOCAL_BUSY, L2CAP_MOVE_WAIT_PREPARE, }; void l2cap_chan_hold(struct l2cap_chan *c); struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c); void l2cap_chan_put(struct l2cap_chan *c); static inline void l2cap_chan_lock(struct l2cap_chan *chan) { mutex_lock_nested(&chan->lock, atomic_read(&chan->nesting)); } static inline void l2cap_chan_unlock(struct l2cap_chan *chan) { mutex_unlock(&chan->lock); } static inline void l2cap_set_timer(struct l2cap_chan *chan, struct delayed_work *work, long timeout) { BT_DBG("chan %p state %s timeout %ld", chan, state_to_string(chan->state), timeout); /* If delayed work cancelled do not hold(chan) since it is already done with previous set_timer */ if (!cancel_delayed_work(work)) l2cap_chan_hold(chan); schedule_delayed_work(work, timeout); } static inline bool l2cap_clear_timer(struct l2cap_chan *chan, struct delayed_work *work) { bool ret; /* put(chan) if delayed work cancelled otherwise it is done in delayed work function */ ret = cancel_delayed_work(work); if (ret) l2cap_chan_put(chan); return ret; } #define __set_chan_timer(c, t) l2cap_set_timer(c, &c->chan_timer, (t)) #define __clear_chan_timer(c) l2cap_clear_timer(c, &c->chan_timer) #define __clear_retrans_timer(c) l2cap_clear_timer(c, &c->retrans_timer) #define __clear_monitor_timer(c) l2cap_clear_timer(c, &c->monitor_timer) #define __set_ack_timer(c) l2cap_set_timer(c, &chan->ack_timer, \ msecs_to_jiffies(L2CAP_DEFAULT_ACK_TO)); #define __clear_ack_timer(c) l2cap_clear_timer(c, &c->ack_timer) static inline int __seq_offset(struct l2cap_chan *chan, __u16 seq1, __u16 seq2) { if (seq1 >= seq2) return seq1 - seq2; else return chan->tx_win_max + 1 - seq2 + seq1; } static inline __u16 __next_seq(struct l2cap_chan *chan, __u16 seq) { return (seq + 1) % (chan->tx_win_max + 1); } static inline struct l2cap_chan *l2cap_chan_no_new_connection(struct l2cap_chan *chan) { return NULL; } static inline int l2cap_chan_no_recv(struct l2cap_chan *chan, struct sk_buff *skb) { return -ENOSYS; } static inline struct sk_buff *l2cap_chan_no_alloc_skb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { return ERR_PTR(-ENOSYS); } static inline void l2cap_chan_no_teardown(struct l2cap_chan *chan, int err) { } static inline void l2cap_chan_no_close(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_ready(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_state_change(struct l2cap_chan *chan, int state, int err) { } static inline void l2cap_chan_no_defer(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_suspend(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_resume(struct l2cap_chan *chan) { } static inline void l2cap_chan_no_set_shutdown(struct l2cap_chan *chan) { } static inline long l2cap_chan_no_get_sndtimeo(struct l2cap_chan *chan) { return 0; } extern bool disable_ertm; extern bool enable_ecred; int l2cap_init_sockets(void); void l2cap_cleanup_sockets(void); bool l2cap_is_socket(struct socket *sock); void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan); void __l2cap_ecred_conn_rsp_defer(struct l2cap_chan *chan); void __l2cap_connect_rsp_defer(struct l2cap_chan *chan); int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm); int l2cap_add_scid(struct l2cap_chan *chan, __u16 scid); struct l2cap_chan *l2cap_chan_create(void); void l2cap_chan_close(struct l2cap_chan *chan, int reason); int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst, u8 dst_type, u16 timeout); int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu); int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, const struct sockcm_cookie *sockc); void l2cap_chan_busy(struct l2cap_chan *chan, int busy); void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail); int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator); void l2cap_chan_set_defaults(struct l2cap_chan *chan); int l2cap_ertm_init(struct l2cap_chan *chan); void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan); void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan); typedef void (*l2cap_chan_func_t)(struct l2cap_chan *chan, void *data); void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, void *data); void l2cap_chan_del(struct l2cap_chan *chan, int err); void l2cap_send_conn_req(struct l2cap_chan *chan); struct l2cap_conn *l2cap_conn_get(struct l2cap_conn *conn); struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *conn); void l2cap_conn_put(struct l2cap_conn *conn); int l2cap_register_user(struct l2cap_conn *conn, struct l2cap_user *user); void l2cap_unregister_user(struct l2cap_conn *conn, struct l2cap_user *user); #endif /* __L2CAP_H */
2 4 4 1 3 3 5 5 1 4 4 3 3 2 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia * * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> */ #include <crypto/algapi.h> #include <crypto/internal/simd.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/module.h> #include <linux/types.h> #include "camellia.h" #include "ecb_cbc_helpers.h" #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 /* 16-way parallel cipher functions (avx/aes-ni) */ asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way); asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src); EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen); } static int ecb_encrypt(struct skcipher_request *req) { ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_enc_16way); ECB_BLOCK(2, camellia_enc_blk_2way); ECB_BLOCK(1, camellia_enc_blk); ECB_WALK_END(); } static int ecb_decrypt(struct skcipher_request *req) { ECB_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); ECB_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_ecb_dec_16way); ECB_BLOCK(2, camellia_dec_blk_2way); ECB_BLOCK(1, camellia_dec_blk); ECB_WALK_END(); } static int cbc_encrypt(struct skcipher_request *req) { CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, -1); CBC_ENC_BLOCK(camellia_enc_blk); CBC_WALK_END(); } static int cbc_decrypt(struct skcipher_request *req) { CBC_WALK_START(req, CAMELLIA_BLOCK_SIZE, CAMELLIA_AESNI_PARALLEL_BLOCKS); CBC_DEC_BLOCK(CAMELLIA_AESNI_PARALLEL_BLOCKS, camellia_cbc_dec_16way); CBC_DEC_BLOCK(2, camellia_decrypt_cbc_2way); CBC_DEC_BLOCK(1, camellia_dec_blk); CBC_WALK_END(); } static struct skcipher_alg camellia_algs[] = { { .base.cra_name = "__ecb(camellia)", .base.cra_driver_name = "__ecb-camellia-aesni", .base.cra_priority = 400, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAMELLIA_MIN_KEY_SIZE, .max_keysize = CAMELLIA_MAX_KEY_SIZE, .setkey = camellia_setkey, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "__cbc(camellia)", .base.cra_driver_name = "__cbc-camellia-aesni", .base.cra_priority = 400, .base.cra_flags = CRYPTO_ALG_INTERNAL, .base.cra_blocksize = CAMELLIA_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct camellia_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CAMELLIA_MIN_KEY_SIZE, .max_keysize = CAMELLIA_MAX_KEY_SIZE, .ivsize = CAMELLIA_BLOCK_SIZE, .setkey = camellia_setkey, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, } }; static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)]; static int __init camellia_aesni_init(void) { const char *feature_name; if (!boot_cpu_has(X86_FEATURE_AVX) || !boot_cpu_has(X86_FEATURE_AES) || !boot_cpu_has(X86_FEATURE_OSXSAVE)) { pr_info("AVX or AES-NI instructions are not detected.\n"); return -ENODEV; } if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, &feature_name)) { pr_info("CPU feature '%s' is not supported.\n", feature_name); return -ENODEV; } return simd_register_skciphers_compat(camellia_algs, ARRAY_SIZE(camellia_algs), camellia_simd_algs); } static void __exit camellia_aesni_fini(void) { simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs), camellia_simd_algs); } module_init(camellia_aesni_init); module_exit(camellia_aesni_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized"); MODULE_ALIAS_CRYPTO("camellia"); MODULE_ALIAS_CRYPTO("camellia-asm");
761 1283 761 1282 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 // SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables mangle table"); #define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, }; static unsigned int ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; const struct iphdr *iph; __be32 saddr, daddr; u32 mark; int err; u8 tos; /* Save things which could affect route */ mark = skb->mark; iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; tos = iph->tos; ret = ipt_do_table(priv, skb, state); verdict = ret & NF_VERDICT_MASK; /* Reroute for ANY change. */ if (verdict != NF_DROP && verdict != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } } return ret; } /* The work comes in here from netfilter.c. */ static unsigned int iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(priv, skb, state); return ipt_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; static int iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; int ret; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { ipt_unregister_table_pre_exit(net, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) { ipt_unregister_table_exit(net, "mangle"); } static struct pernet_operations iptable_mangle_net_ops = { .pre_exit = iptable_mangle_net_pre_exit, .exit = iptable_mangle_net_exit, }; static int __init iptable_mangle_init(void) { int ret = xt_register_template(&packet_mangler, iptable_mangle_table_init); if (ret < 0) return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); if (IS_ERR(mangle_ops)) { xt_unregister_template(&packet_mangler); ret = PTR_ERR(mangle_ops); return ret; } ret = register_pernet_subsys(&iptable_mangle_net_ops); if (ret < 0) { xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } return ret; } static void __exit iptable_mangle_fini(void) { unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); kfree(mangle_ops); } module_init(iptable_mangle_init); module_exit(iptable_mangle_fini);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * These are the definitions needed for the tsnmap type. The tsnmap is used * to track out of order TSNs received. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Sridhar Samudrala <sri@us.ibm.com> */ #include <net/sctp/constants.h> #ifndef __sctp_tsnmap_h__ #define __sctp_tsnmap_h__ /* RFC 2960 12.2 Parameters necessary per association (i.e. the TCB) * Mapping An array of bits or bytes indicating which out of * Array order TSN's have been received (relative to the * Last Rcvd TSN). If no gaps exist, i.e. no out of * order packets have been received, this array * will be set to all zero. This structure may be * in the form of a circular buffer or bit array. */ struct sctp_tsnmap { /* This array counts the number of chunks with each TSN. * It points at one of the two buffers with which we will * ping-pong between. */ unsigned long *tsn_map; /* This is the TSN at tsn_map[0]. */ __u32 base_tsn; /* Last Rcvd : This is the last TSN received in * TSN : sequence. This value is set initially by * : taking the peer's Initial TSN, received in * : the INIT or INIT ACK chunk, and subtracting * : one from it. * * Throughout most of the specification this is called the * "Cumulative TSN ACK Point". In this case, we * ignore the advice in 12.2 in favour of the term * used in the bulk of the text. */ __u32 cumulative_tsn_ack_point; /* This is the highest TSN we've marked. */ __u32 max_tsn_seen; /* This is the minimum number of TSNs we can track. This corresponds * to the size of tsn_map. Note: the overflow_map allows us to * potentially track more than this quantity. */ __u16 len; /* Data chunks pending receipt. used by SCTP_STATUS sockopt */ __u16 pending_data; /* Record duplicate TSNs here. We clear this after * every SACK. Store up to SCTP_MAX_DUP_TSNS worth of * information. */ __u16 num_dup_tsns; __be32 dup_tsns[SCTP_MAX_DUP_TSNS]; }; struct sctp_tsnmap_iter { __u32 start; }; /* Initialize a block of memory as a tsnmap. */ struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *, __u16 len, __u32 initial_tsn, gfp_t gfp); void sctp_tsnmap_free(struct sctp_tsnmap *map); /* Test the tracking state of this TSN. * Returns: * 0 if the TSN has not yet been seen * >0 if the TSN has been seen (duplicate) * <0 if the TSN is invalid (too large to track) */ int sctp_tsnmap_check(const struct sctp_tsnmap *, __u32 tsn); /* Mark this TSN as seen. */ int sctp_tsnmap_mark(struct sctp_tsnmap *, __u32 tsn, struct sctp_transport *trans); /* Mark this TSN and all lower as seen. */ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn); /* Retrieve the Cumulative TSN ACK Point. */ static inline __u32 sctp_tsnmap_get_ctsn(const struct sctp_tsnmap *map) { return map->cumulative_tsn_ack_point; } /* Retrieve the highest TSN we've seen. */ static inline __u32 sctp_tsnmap_get_max_tsn_seen(const struct sctp_tsnmap *map) { return map->max_tsn_seen; } /* How many duplicate TSNs are stored? */ static inline __u16 sctp_tsnmap_num_dups(struct sctp_tsnmap *map) { return map->num_dup_tsns; } /* Return pointer to duplicate tsn array as needed by SACK. */ static inline __be32 *sctp_tsnmap_get_dups(struct sctp_tsnmap *map) { map->num_dup_tsns = 0; return map->dup_tsns; } /* How many gap ack blocks do we have recorded? */ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map, struct sctp_gap_ack_block *gabs); /* Refresh the count on pending data. */ __u16 sctp_tsnmap_pending(struct sctp_tsnmap *map); /* Is there a gap in the TSN map? */ static inline int sctp_tsnmap_has_gap(const struct sctp_tsnmap *map) { return map->cumulative_tsn_ack_point != map->max_tsn_seen; } /* Mark a duplicate TSN. Note: limit the storage of duplicate TSN * information. */ static inline void sctp_tsnmap_mark_dup(struct sctp_tsnmap *map, __u32 tsn) { if (map->num_dup_tsns < SCTP_MAX_DUP_TSNS) map->dup_tsns[map->num_dup_tsns++] = htonl(tsn); } /* Renege a TSN that was seen. */ void sctp_tsnmap_renege(struct sctp_tsnmap *, __u32 tsn); /* Is there a gap in the TSN map? */ int sctp_tsnmap_has_gap(const struct sctp_tsnmap *); #endif /* __sctp_tsnmap_h__ */
622 616 622 628 10 622 616 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 /* * kmod - the kernel module loader * * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> */ #include <linux/module.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/binfmts.h> #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/resource.h> #include <linux/notifier.h> #include <linux/suspend.h> #include <linux/rwsem.h> #include <linux/ptrace.h> #include <linux/async.h> #include <linux/uaccess.h> #include <trace/events/module.h> #include "internal.h" /* * Assuming: * * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE, * (u64) THREAD_SIZE * 8UL); * * If you need less than 50 threads would mean we're dealing with systems * smaller than 3200 pages. This assumes you are capable of having ~13M memory, * and this would only be an upper limit, after which the OOM killer would take * effect. Systems like these are very unlikely if modules are enabled. */ #define MAX_KMOD_CONCURRENT 50 static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT); /* * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads * running at the same time without returning. When this happens we * believe you've somehow ended up with a recursive module dependency * creating a loop. * * We have no option but to fail. * * Userspace should proactively try to detect and prevent these. */ #define MAX_KMOD_ALL_BUSY_TIMEOUT 5 /* modprobe_path is set via /proc/sys. */ char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH; static void free_modprobe_argv(struct subprocess_info *info) { kfree(info->argv[3]); /* check call_modprobe() */ kfree(info->argv); } static int call_modprobe(char *orig_module_name, int wait) { struct subprocess_info *info; static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; char *module_name; int ret; char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); if (!argv) goto out; module_name = kstrdup(orig_module_name, GFP_KERNEL); if (!module_name) goto free_argv; argv[0] = modprobe_path; argv[1] = "-q"; argv[2] = "--"; argv[3] = module_name; /* check free_modprobe_argv() */ argv[4] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, NULL, free_modprobe_argv, NULL); if (!info) goto free_module_name; ret = call_usermodehelper_exec(info, wait | UMH_KILLABLE); kmod_dup_request_announce(orig_module_name, ret); return ret; free_module_name: kfree(module_name); free_argv: kfree(argv); out: kmod_dup_request_announce(orig_module_name, -ENOMEM); return -ENOMEM; } /** * __request_module - try to load a kernel module * @wait: wait (or not) for the operation to complete * @fmt: printf style format string for the name of the module * @...: arguments as specified in the format string * * Load a module using the user mode module loader. The function returns * zero on success or a negative errno code or positive exit code from * "modprobe" on failure. Note that a successful module load does not mean * the module did not then unload and exit on an error of its own. Callers * must check that the service they requested is now available not blindly * invoke it. * * If module auto-loading support is disabled then this function * simply returns -ENOENT. */ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; int ret, dup_ret; /* * We don't allow synchronous module loading from async. Module * init may invoke async_synchronize_full() which will end up * waiting for this task which already is waiting for the module * loading to complete, leading to a deadlock. */ WARN_ON_ONCE(wait && current_is_async()); if (!modprobe_path[0]) return -ENOENT; va_start(args, fmt); ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); va_end(args); if (ret >= MODULE_NAME_LEN) return -ENAMETOOLONG; ret = security_kernel_module_request(module_name); if (ret) return ret; ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); if (ret) { pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); return ret; } trace_module_request(module_name, wait, _RET_IP_); if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) { ret = dup_ret; goto out; } ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); out: up(&kmod_concurrent_max); return ret; } EXPORT_SYMBOL(__request_module);
6 2 3 3 6 3 4 6 12 11 6 7 5 4 7 8 6 5 9 4 5 9 6 3 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 /* * xxHash - Extremely Fast Hash algorithm * Copyright (C) 2012-2016, Yann Collet. * * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This program is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License version 2 as published by the * Free Software Foundation. This program is dual-licensed; you may select * either version 2 of the GNU General Public License ("GPL") or BSD license * ("BSD"). * * You can contact the author at: * - xxHash homepage: https://cyan4973.github.io/xxHash/ * - xxHash source repository: https://github.com/Cyan4973/xxHash */ #include <linux/unaligned.h> #include <linux/errno.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> #include <linux/xxhash.h> /*-************************************* * Macros **************************************/ #define xxh_rotl32(x, r) ((x << r) | (x >> (32 - r))) #define xxh_rotl64(x, r) ((x << r) | (x >> (64 - r))) #ifdef __LITTLE_ENDIAN # define XXH_CPU_LITTLE_ENDIAN 1 #else # define XXH_CPU_LITTLE_ENDIAN 0 #endif /*-************************************* * Constants **************************************/ static const uint32_t PRIME32_1 = 2654435761U; static const uint32_t PRIME32_2 = 2246822519U; static const uint32_t PRIME32_3 = 3266489917U; static const uint32_t PRIME32_4 = 668265263U; static const uint32_t PRIME32_5 = 374761393U; static const uint64_t PRIME64_1 = 11400714785074694791ULL; static const uint64_t PRIME64_2 = 14029467366897019727ULL; static const uint64_t PRIME64_3 = 1609587929392839161ULL; static const uint64_t PRIME64_4 = 9650029242287828579ULL; static const uint64_t PRIME64_5 = 2870177450012600261ULL; /*-************************** * Utils ***************************/ void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src) { memcpy(dst, src, sizeof(*dst)); } EXPORT_SYMBOL(xxh32_copy_state); void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src) { memcpy(dst, src, sizeof(*dst)); } EXPORT_SYMBOL(xxh64_copy_state); /*-*************************** * Simple Hash Functions ****************************/ static uint32_t xxh32_round(uint32_t seed, const uint32_t input) { seed += input * PRIME32_2; seed = xxh_rotl32(seed, 13); seed *= PRIME32_1; return seed; } uint32_t xxh32(const void *input, const size_t len, const uint32_t seed) { const uint8_t *p = (const uint8_t *)input; const uint8_t *b_end = p + len; uint32_t h32; if (len >= 16) { const uint8_t *const limit = b_end - 16; uint32_t v1 = seed + PRIME32_1 + PRIME32_2; uint32_t v2 = seed + PRIME32_2; uint32_t v3 = seed + 0; uint32_t v4 = seed - PRIME32_1; do { v1 = xxh32_round(v1, get_unaligned_le32(p)); p += 4; v2 = xxh32_round(v2, get_unaligned_le32(p)); p += 4; v3 = xxh32_round(v3, get_unaligned_le32(p)); p += 4; v4 = xxh32_round(v4, get_unaligned_le32(p)); p += 4; } while (p <= limit); h32 = xxh_rotl32(v1, 1) + xxh_rotl32(v2, 7) + xxh_rotl32(v3, 12) + xxh_rotl32(v4, 18); } else { h32 = seed + PRIME32_5; } h32 += (uint32_t)len; while (p + 4 <= b_end) { h32 += get_unaligned_le32(p) * PRIME32_3; h32 = xxh_rotl32(h32, 17) * PRIME32_4; p += 4; } while (p < b_end) { h32 += (*p) * PRIME32_5; h32 = xxh_rotl32(h32, 11) * PRIME32_1; p++; } h32 ^= h32 >> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } EXPORT_SYMBOL(xxh32); static uint64_t xxh64_round(uint64_t acc, const uint64_t input) { acc += input * PRIME64_2; acc = xxh_rotl64(acc, 31); acc *= PRIME64_1; return acc; } static uint64_t xxh64_merge_round(uint64_t acc, uint64_t val) { val = xxh64_round(0, val); acc ^= val; acc = acc * PRIME64_1 + PRIME64_4; return acc; } uint64_t xxh64(const void *input, const size_t len, const uint64_t seed) { const uint8_t *p = (const uint8_t *)input; const uint8_t *const b_end = p + len; uint64_t h64; if (len >= 32) { const uint8_t *const limit = b_end - 32; uint64_t v1 = seed + PRIME64_1 + PRIME64_2; uint64_t v2 = seed + PRIME64_2; uint64_t v3 = seed + 0; uint64_t v4 = seed - PRIME64_1; do { v1 = xxh64_round(v1, get_unaligned_le64(p)); p += 8; v2 = xxh64_round(v2, get_unaligned_le64(p)); p += 8; v3 = xxh64_round(v3, get_unaligned_le64(p)); p += 8; v4 = xxh64_round(v4, get_unaligned_le64(p)); p += 8; } while (p <= limit); h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); h64 = xxh64_merge_round(h64, v1); h64 = xxh64_merge_round(h64, v2); h64 = xxh64_merge_round(h64, v3); h64 = xxh64_merge_round(h64, v4); } else { h64 = seed + PRIME64_5; } h64 += (uint64_t)len; while (p + 8 <= b_end) { const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); h64 ^= k1; h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; p += 8; } if (p + 4 <= b_end) { h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p += 4; } while (p < b_end) { h64 ^= (*p) * PRIME64_5; h64 = xxh_rotl64(h64, 11) * PRIME64_1; p++; } h64 ^= h64 >> 33; h64 *= PRIME64_2; h64 ^= h64 >> 29; h64 *= PRIME64_3; h64 ^= h64 >> 32; return h64; } EXPORT_SYMBOL(xxh64); /*-************************************************** * Advanced Hash Functions ***************************************************/ void xxh32_reset(struct xxh32_state *statePtr, const uint32_t seed) { /* use a local state for memcpy() to avoid strict-aliasing warnings */ struct xxh32_state state; memset(&state, 0, sizeof(state)); state.v1 = seed + PRIME32_1 + PRIME32_2; state.v2 = seed + PRIME32_2; state.v3 = seed + 0; state.v4 = seed - PRIME32_1; memcpy(statePtr, &state, sizeof(state)); } EXPORT_SYMBOL(xxh32_reset); void xxh64_reset(struct xxh64_state *statePtr, const uint64_t seed) { /* use a local state for memcpy() to avoid strict-aliasing warnings */ struct xxh64_state state; memset(&state, 0, sizeof(state)); state.v1 = seed + PRIME64_1 + PRIME64_2; state.v2 = seed + PRIME64_2; state.v3 = seed + 0; state.v4 = seed - PRIME64_1; memcpy(statePtr, &state, sizeof(state)); } EXPORT_SYMBOL(xxh64_reset); int xxh32_update(struct xxh32_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; const uint8_t *const b_end = p + len; if (input == NULL) return -EINVAL; state->total_len_32 += (uint32_t)len; state->large_len |= (len >= 16) | (state->total_len_32 >= 16); if (state->memsize + len < 16) { /* fill in tmp buffer */ memcpy((uint8_t *)(state->mem32) + state->memsize, input, len); state->memsize += (uint32_t)len; return 0; } if (state->memsize) { /* some data left from previous update */ const uint32_t *p32 = state->mem32; memcpy((uint8_t *)(state->mem32) + state->memsize, input, 16 - state->memsize); state->v1 = xxh32_round(state->v1, get_unaligned_le32(p32)); p32++; state->v2 = xxh32_round(state->v2, get_unaligned_le32(p32)); p32++; state->v3 = xxh32_round(state->v3, get_unaligned_le32(p32)); p32++; state->v4 = xxh32_round(state->v4, get_unaligned_le32(p32)); p32++; p += 16-state->memsize; state->memsize = 0; } if (p <= b_end - 16) { const uint8_t *const limit = b_end - 16; uint32_t v1 = state->v1; uint32_t v2 = state->v2; uint32_t v3 = state->v3; uint32_t v4 = state->v4; do { v1 = xxh32_round(v1, get_unaligned_le32(p)); p += 4; v2 = xxh32_round(v2, get_unaligned_le32(p)); p += 4; v3 = xxh32_round(v3, get_unaligned_le32(p)); p += 4; v4 = xxh32_round(v4, get_unaligned_le32(p)); p += 4; } while (p <= limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < b_end) { memcpy(state->mem32, p, (size_t)(b_end-p)); state->memsize = (uint32_t)(b_end-p); } return 0; } EXPORT_SYMBOL(xxh32_update); uint32_t xxh32_digest(const struct xxh32_state *state) { const uint8_t *p = (const uint8_t *)state->mem32; const uint8_t *const b_end = (const uint8_t *)(state->mem32) + state->memsize; uint32_t h32; if (state->large_len) { h32 = xxh_rotl32(state->v1, 1) + xxh_rotl32(state->v2, 7) + xxh_rotl32(state->v3, 12) + xxh_rotl32(state->v4, 18); } else { h32 = state->v3 /* == seed */ + PRIME32_5; } h32 += state->total_len_32; while (p + 4 <= b_end) { h32 += get_unaligned_le32(p) * PRIME32_3; h32 = xxh_rotl32(h32, 17) * PRIME32_4; p += 4; } while (p < b_end) { h32 += (*p) * PRIME32_5; h32 = xxh_rotl32(h32, 11) * PRIME32_1; p++; } h32 ^= h32 >> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } EXPORT_SYMBOL(xxh32_digest); int xxh64_update(struct xxh64_state *state, const void *input, const size_t len) { const uint8_t *p = (const uint8_t *)input; const uint8_t *const b_end = p + len; if (input == NULL) return -EINVAL; state->total_len += len; if (state->memsize + len < 32) { /* fill in tmp buffer */ memcpy(((uint8_t *)state->mem64) + state->memsize, input, len); state->memsize += (uint32_t)len; return 0; } if (state->memsize) { /* tmp buffer is full */ uint64_t *p64 = state->mem64; memcpy(((uint8_t *)p64) + state->memsize, input, 32 - state->memsize); state->v1 = xxh64_round(state->v1, get_unaligned_le64(p64)); p64++; state->v2 = xxh64_round(state->v2, get_unaligned_le64(p64)); p64++; state->v3 = xxh64_round(state->v3, get_unaligned_le64(p64)); p64++; state->v4 = xxh64_round(state->v4, get_unaligned_le64(p64)); p += 32 - state->memsize; state->memsize = 0; } if (p + 32 <= b_end) { const uint8_t *const limit = b_end - 32; uint64_t v1 = state->v1; uint64_t v2 = state->v2; uint64_t v3 = state->v3; uint64_t v4 = state->v4; do { v1 = xxh64_round(v1, get_unaligned_le64(p)); p += 8; v2 = xxh64_round(v2, get_unaligned_le64(p)); p += 8; v3 = xxh64_round(v3, get_unaligned_le64(p)); p += 8; v4 = xxh64_round(v4, get_unaligned_le64(p)); p += 8; } while (p <= limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < b_end) { memcpy(state->mem64, p, (size_t)(b_end-p)); state->memsize = (uint32_t)(b_end - p); } return 0; } EXPORT_SYMBOL(xxh64_update); uint64_t xxh64_digest(const struct xxh64_state *state) { const uint8_t *p = (const uint8_t *)state->mem64; const uint8_t *const b_end = (const uint8_t *)state->mem64 + state->memsize; uint64_t h64; if (state->total_len >= 32) { const uint64_t v1 = state->v1; const uint64_t v2 = state->v2; const uint64_t v3 = state->v3; const uint64_t v4 = state->v4; h64 = xxh_rotl64(v1, 1) + xxh_rotl64(v2, 7) + xxh_rotl64(v3, 12) + xxh_rotl64(v4, 18); h64 = xxh64_merge_round(h64, v1); h64 = xxh64_merge_round(h64, v2); h64 = xxh64_merge_round(h64, v3); h64 = xxh64_merge_round(h64, v4); } else { h64 = state->v3 + PRIME64_5; } h64 += (uint64_t)state->total_len; while (p + 8 <= b_end) { const uint64_t k1 = xxh64_round(0, get_unaligned_le64(p)); h64 ^= k1; h64 = xxh_rotl64(h64, 27) * PRIME64_1 + PRIME64_4; p += 8; } if (p + 4 <= b_end) { h64 ^= (uint64_t)(get_unaligned_le32(p)) * PRIME64_1; h64 = xxh_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p += 4; } while (p < b_end) { h64 ^= (*p) * PRIME64_5; h64 = xxh_rotl64(h64, 11) * PRIME64_1; p++; } h64 ^= h64 >> 33; h64 *= PRIME64_2; h64 ^= h64 >> 29; h64 *= PRIME64_3; h64 ^= h64 >> 32; return h64; } EXPORT_SYMBOL(xxh64_digest); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("xxHash");
3 3 3 3 3 2 1 1 1 1 1 1 1 7 8 8 8 8 8 8 11 11 1 1 1 1 1 1 1 1 1 1 1 2 3 3 3 11 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, &init_cred, KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. */ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). */ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; break; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); struct cred *new = container_of(twork, struct cred, rcu); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; } /* If get_ucounts fails more bits are needed in the refcount */ if (unlikely(!get_ucounts(old->ucounts))) { WARN_ONCE(1, "In %s get_ucounts failed\n", __func__); put_cred(new); return; } new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; new->fsuid = old->fsuid; new-> gid = old-> gid; new-> egid = old-> egid; new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); new->ucounts = old->ucounts; new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); new->securebits = old->securebits; new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); commit_creds(new); } /* * Make sure that root's user and user-session keyrings exist. */ static int __init init_root_keyring(void) { return look_up_user_keyrings(NULL, NULL); } late_initcall(init_root_keyring);
11468 11711 2320 2317 296 297 814 15388 2199 570 710 585 570 39 39 2 13 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_FIND_H_ #define __LINUX_FIND_H_ #ifndef __LINUX_BITMAP_H #error only <linux/bitmap.h> can be included directly #endif #include <linux/bitops.h> unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits, unsigned long start); unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start); unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits, unsigned long start); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n); unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n); unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n); extern unsigned long _find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size); unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifdef __BIG_ENDIAN unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size); unsigned long _find_next_zero_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); unsigned long _find_next_bit_le(const unsigned long *addr, unsigned long size, unsigned long offset); #endif #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit(addr, size, offset); } #endif #ifndef find_next_and_bit /** * find_next_and_bit - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & *addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_and_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_andnot_bit /** * find_next_andnot_bit - find the next set bit in *addr1 excluding all the bits * in *addr2 * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr1 & ~*addr2 & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_andnot_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_or_bit /** * find_next_or_bit - find the next set bit in either memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = (*addr1 | *addr2) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_or_bit(addr1, addr2, size, offset); } #endif #ifndef find_next_zero_bit /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val; if (unlikely(offset >= size)) return size; val = *addr | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit(addr, size, offset); } #endif #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_bit(addr, size); } #endif /** * find_nth_bit - find N'th set bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * The following is semantically equivalent: * idx = find_nth_bit(addr, size, 0); * idx = find_first_bit(addr, size); * * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_bit(addr, size, n); } /** * find_nth_and_bit - find N'th set bit in 2 memory regions * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_bit(addr1, addr2, size, n); } /** * find_nth_andnot_bit - find N'th set bit in 2 memory regions, * flipping bits in 2nd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & (~*addr2) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_andnot_bit(addr1, addr2, size, n); } /** * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions, * excluding those set in 3rd region * @addr1: The 1st address to start the search at * @addr2: The 2nd address to start the search at * @addr3: The 3rd address to start the search at * @size: The maximum number of bits to search * @n: The number of set bit, which position is needed, counting from 0 * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ static __always_inline unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size, unsigned long n) { if (n >= size) return size; if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0); return val ? fns(val, n) : size; } return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n); } #ifndef find_first_and_bit /** * find_first_and_bit - find the first set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the next set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_bit(addr1, addr2, size); } #endif /** * find_first_and_and_bit - find the first set bit in 3 memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @addr3: The third address to base the search on * @size: The bitmap size in bits * * Returns the bit number for the first set bit * If no bits are set, returns @size. */ static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0); return val ? __ffs(val) : size; } return _find_first_and_and_bit(addr1, addr2, addr3, size); } #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at * @size: The maximum number of bits to search * * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit(addr, size); } #endif #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region * @addr: The address to start the search at * @size: The number of bits to search * * Returns the bit number of the last set bit, or size. */ static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = *addr & GENMASK(size - 1, 0); return val ? __fls(val) : size; } return _find_last_bit(addr, size); } #endif /** * find_next_and_bit_wrap - find the next set bit in both memory regions * @addr1: The first address to base the search on * @addr2: The second address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { unsigned long bit = find_next_and_bit(addr1, addr2, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_and_bit(addr1, addr2, offset); return bit < offset ? bit : size; } /** * find_next_bit_wrap - find the next set bit in a memory region * @addr: The address to base the search on * @size: The bitmap size in bits * @offset: The bitnumber to start searching at * * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { unsigned long bit = find_next_bit(addr, size, offset); if (bit < size || offset == 0) return bit; bit = find_first_bit(addr, offset); return bit < offset ? bit : size; } /* * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { unsigned long bit; /* If not wrapped around */ if (n > start) { /* and have a bit, just return it. */ bit = find_next_bit(bitmap, size, n); if (bit < size) return bit; /* Otherwise, wrap around and ... */ n = 0; } /* Search the other part. */ bit = find_next_bit(bitmap, start, n); return bit < start ? bit : size; } /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump * @addr: address to base the search on * @size: bitmap size in number of bits * @offset: bit offset at which to start searching * * Returns the bit offset for the next set clump; the found clump value is * copied to the location pointed by @clump. If no bits are set, returns @size. */ extern unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset); #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) #if defined(__LITTLE_ENDIAN) static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) | ~GENMASK(size - 1, offset); return val == ~0UL ? size : ffz(val); } return _find_next_zero_bit_le(addr, size, offset); } #endif #ifndef find_first_zero_bit_le static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { unsigned long val = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0); return val == ~0UL ? size : ffz(val); } return _find_first_zero_bit_le(addr, size); } #endif #ifndef find_next_bit_le static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { if (small_const_nbits(size)) { unsigned long val = *(const unsigned long *)addr; if (unlikely(offset >= size)) return size; val = swab(val) & GENMASK(size - 1, offset); return val ? __ffs(val) : size; } return _find_next_bit_le(addr, size, offset); } #endif #else #error "Please fix <asm/byteorder.h>" #endif #define for_each_set_bit(bit, addr, size) \ for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_and_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_andnot_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) #define for_each_or_bit(bit, addr1, addr2, size) \ for ((bit) = 0; \ (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\ (bit)++) /* same as for_each_set_bit() but use bit as value to start with */ #define for_each_set_bit_from(bit, addr, size) \ for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++) #define for_each_clear_bit(bit, addr, size) \ for ((bit) = 0; \ (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); \ (bit)++) /* same as for_each_clear_bit() but use bit as value to start with */ #define for_each_clear_bit_from(bit, addr, size) \ for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++) /** * for_each_set_bitrange - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit) * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_bit((addr), (size), b), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_bit((addr), (size), (b)), \ (e) = find_next_zero_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first unset bit) * @e: bit offset of end of current bitrange (first set bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange(b, e, addr, size) \ for ((b) = 0; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) * @b: bit offset of start of current bitrange (first set bit); must be initialized * @e: bit offset of end of current bitrange (first unset bit) * @addr: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_clear_bitrange_from(b, e, addr, size) \ for (; \ (b) = find_next_zero_bit((addr), (size), (b)), \ (e) = find_next_bit((addr), (size), (b) + 1), \ (b) < (size); \ (b) = (e) + 1) /** * for_each_set_bit_wrap - iterate over all set bits starting from @start, and * wrapping around the end of bitmap. * @bit: offset for current iteration * @addr: bitmap address to base the search on * @size: bitmap size in number of bits * @start: Starting bit for bitmap traversing, wrapping around the bitmap end */ #define for_each_set_bit_wrap(bit, addr, size, start) \ for ((bit) = find_next_bit_wrap((addr), (size), (start)); \ (bit) < (size); \ (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1)) /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset * @clump: location to store copy of current 8-bit clump * @bits: bitmap address to base the search on * @size: bitmap size in number of bits */ #define for_each_set_clump8(start, clump, bits, size) \ for ((start) = find_first_clump8(&(clump), (bits), (size)); \ (start) < (size); \ (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) #endif /*__LINUX_FIND_H_ */
25 16 18 25 20 25 24 25 15 15 15 15 29 15 15 28 29 25 21 24 24 25 25 25 25 20 24 24 24 27 25 4 24 27 3 24 24 19 19 2 18 19 22 12 5 24 1 10 4 7 1 4 10 10 22 1 26 11 4 23 23 1 23 23 23 16 23 23 11 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Policy advertisement to userspace * * Authors: Johannes Berg <johannes@sipsolutions.net> * * Copyright 2019 Intel Corporation */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/types.h> #include <net/netlink.h> #define INITIAL_POLICIES_ALLOC 10 struct netlink_policy_dump_state { unsigned int policy_idx; unsigned int attr_idx; unsigned int n_alloc; struct { const struct nla_policy *policy; unsigned int maxtype; } policies[] __counted_by(n_alloc); }; static int add_policy(struct netlink_policy_dump_state **statep, const struct nla_policy *policy, unsigned int maxtype) { struct netlink_policy_dump_state *state = *statep; unsigned int old_n_alloc, n_alloc, i; if (!policy || !maxtype) return 0; for (i = 0; i < state->n_alloc; i++) { if (state->policies[i].policy == policy && state->policies[i].maxtype == maxtype) return 0; if (!state->policies[i].policy) { state->policies[i].policy = policy; state->policies[i].maxtype = maxtype; return 0; } } n_alloc = state->n_alloc + INITIAL_POLICIES_ALLOC; state = krealloc(state, struct_size(state, policies, n_alloc), GFP_KERNEL); if (!state) return -ENOMEM; old_n_alloc = state->n_alloc; state->n_alloc = n_alloc; memset(&state->policies[old_n_alloc], 0, flex_array_size(state, policies, n_alloc - old_n_alloc)); state->policies[old_n_alloc].policy = policy; state->policies[old_n_alloc].maxtype = maxtype; *statep = state; return 0; } /** * netlink_policy_dump_get_policy_idx - retrieve policy index * @state: the policy dump state * @policy: the policy to find * @maxtype: the policy's maxattr * * Returns: the index of the given policy in the dump state * * Call this to find a policy index when you've added multiple and e.g. * need to tell userspace which command has which policy (by index). * * Note: this will WARN and return 0 if the policy isn't found, which * means it wasn't added in the first place, which would be an * internal consistency bug. */ int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, const struct nla_policy *policy, unsigned int maxtype) { unsigned int i; if (WARN_ON(!policy || !maxtype)) return 0; for (i = 0; i < state->n_alloc; i++) { if (state->policies[i].policy == policy && state->policies[i].maxtype == maxtype) return i; } WARN_ON(1); return 0; } static struct netlink_policy_dump_state *alloc_state(void) { struct netlink_policy_dump_state *state; state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC), GFP_KERNEL); if (!state) return ERR_PTR(-ENOMEM); state->n_alloc = INITIAL_POLICIES_ALLOC; return state; } /** * netlink_policy_dump_add_policy - add a policy to the dump * @pstate: state to add to, may be reallocated, must be %NULL the first time * @policy: the new policy to add to the dump * @maxtype: the new policy's max attr type * * Returns: 0 on success, a negative error code otherwise. * * Call this to allocate a policy dump state, and to add policies to it. This * should be called from the dump start() callback. * * Note: on failures, any previously allocated state is freed. */ int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate, const struct nla_policy *policy, unsigned int maxtype) { struct netlink_policy_dump_state *state = *pstate; unsigned int policy_idx; int err; if (!state) { state = alloc_state(); if (IS_ERR(state)) return PTR_ERR(state); } /* * walk the policies and nested ones first, and build * a linear list of them. */ err = add_policy(&state, policy, maxtype); if (err) goto err_try_undo; for (policy_idx = 0; policy_idx < state->n_alloc && state->policies[policy_idx].policy; policy_idx++) { const struct nla_policy *policy; unsigned int type; policy = state->policies[policy_idx].policy; for (type = 0; type <= state->policies[policy_idx].maxtype; type++) { switch (policy[type].type) { case NLA_NESTED: case NLA_NESTED_ARRAY: err = add_policy(&state, policy[type].nested_policy, policy[type].len); if (err) goto err_try_undo; break; default: break; } } } *pstate = state; return 0; err_try_undo: /* Try to preserve reasonable unwind semantics - if we're starting from * scratch clean up fully, otherwise record what we got and caller will. */ if (!*pstate) netlink_policy_dump_free(state); else *pstate = state; return err; } static bool netlink_policy_dump_finished(struct netlink_policy_dump_state *state) { return state->policy_idx >= state->n_alloc || !state->policies[state->policy_idx].policy; } /** * netlink_policy_dump_loop - dumping loop indicator * @state: the policy dump state * * Returns: %true if the dump continues, %false otherwise * * Note: this frees the dump state when finishing */ bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state) { return !netlink_policy_dump_finished(state); } int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt) { /* nested + type */ int common = 2 * nla_attr_size(sizeof(u32)); switch (pt->type) { case NLA_UNSPEC: case NLA_REJECT: /* these actually don't need any space */ return 0; case NLA_NESTED: case NLA_NESTED_ARRAY: /* common, policy idx, policy maxattr */ return common + 2 * nla_attr_size(sizeof(u32)); case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_MSECS: case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: case NLA_UINT: /* maximum is common, u64 min/max with padding */ return common + 2 * (nla_attr_size(0) + nla_attr_size(sizeof(u64))); case NLA_BITFIELD32: return common + nla_attr_size(sizeof(u32)); case NLA_STRING: case NLA_NUL_STRING: case NLA_BINARY: /* maximum is common, u32 min-length/max-length */ return common + 2 * nla_attr_size(sizeof(u32)); case NLA_FLAG: return common; } /* this should then cause a warning later */ return 0; } static int __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, struct sk_buff *skb, const struct nla_policy *pt, int nestattr) { int estimate = netlink_policy_dump_attr_size_estimate(pt); enum netlink_attribute_type type; struct nlattr *attr; attr = nla_nest_start(skb, nestattr); if (!attr) return -ENOBUFS; switch (pt->type) { default: case NLA_UNSPEC: case NLA_REJECT: /* skip - use NLA_MIN_LEN to advertise such */ nla_nest_cancel(skb, attr); return -ENODATA; case NLA_NESTED: type = NL_ATTR_TYPE_NESTED; fallthrough; case NLA_NESTED_ARRAY: if (pt->type == NLA_NESTED_ARRAY) type = NL_ATTR_TYPE_NESTED_ARRAY; if (state && pt->nested_policy && pt->len && (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_IDX, netlink_policy_dump_get_policy_idx(state, pt->nested_policy, pt->len)) || nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, pt->len))) goto nla_put_failure; break; case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: { struct netlink_range_validation range; if (pt->type == NLA_U8) type = NL_ATTR_TYPE_U8; else if (pt->type == NLA_U16) type = NL_ATTR_TYPE_U16; else if (pt->type == NLA_U32) type = NL_ATTR_TYPE_U32; else if (pt->type == NLA_U64) type = NL_ATTR_TYPE_U64; else type = NL_ATTR_TYPE_UINT; if (pt->validation_type == NLA_VALIDATE_MASK) { if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MASK, pt->mask, NL_POLICY_TYPE_ATTR_PAD)) goto nla_put_failure; break; } nla_get_range_unsigned(pt, &range); if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U, range.min, NL_POLICY_TYPE_ATTR_PAD) || nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_U, range.max, NL_POLICY_TYPE_ATTR_PAD)) goto nla_put_failure; break; } case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: { struct netlink_range_validation_signed range; if (pt->type == NLA_S8) type = NL_ATTR_TYPE_S8; else if (pt->type == NLA_S16) type = NL_ATTR_TYPE_S16; else if (pt->type == NLA_S32) type = NL_ATTR_TYPE_S32; else if (pt->type == NLA_S64) type = NL_ATTR_TYPE_S64; else type = NL_ATTR_TYPE_SINT; nla_get_range_signed(pt, &range); if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, range.min, NL_POLICY_TYPE_ATTR_PAD) || nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_S, range.max, NL_POLICY_TYPE_ATTR_PAD)) goto nla_put_failure; break; } case NLA_BITFIELD32: type = NL_ATTR_TYPE_BITFIELD32; if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, pt->bitfield32_valid)) goto nla_put_failure; break; case NLA_STRING: case NLA_NUL_STRING: case NLA_BINARY: if (pt->type == NLA_STRING) type = NL_ATTR_TYPE_STRING; else if (pt->type == NLA_NUL_STRING) type = NL_ATTR_TYPE_NUL_STRING; else type = NL_ATTR_TYPE_BINARY; if (pt->validation_type == NLA_VALIDATE_RANGE || pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG) { struct netlink_range_validation range; nla_get_range_unsigned(pt, &range); if (range.min && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, range.min)) goto nla_put_failure; if (range.max < U16_MAX && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, range.max)) goto nla_put_failure; } else if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len)) { goto nla_put_failure; } break; case NLA_FLAG: type = NL_ATTR_TYPE_FLAG; break; } if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_TYPE, type)) goto nla_put_failure; nla_nest_end(skb, attr); WARN_ON(attr->nla_len > estimate); return 0; nla_put_failure: nla_nest_cancel(skb, attr); return -ENOBUFS; } /** * netlink_policy_dump_write_attr - write a given attribute policy * @skb: the message skb to write to * @pt: the attribute's policy * @nestattr: the nested attribute ID to use * * Returns: 0 on success, an error code otherwise; -%ENODATA is * special, indicating that there's no policy data and * the attribute is generally rejected. */ int netlink_policy_dump_write_attr(struct sk_buff *skb, const struct nla_policy *pt, int nestattr) { return __netlink_policy_dump_write_attr(NULL, skb, pt, nestattr); } /** * netlink_policy_dump_write - write current policy dump attributes * @skb: the message skb to write to * @state: the policy dump state * * Returns: 0 on success, an error code otherwise */ int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state) { const struct nla_policy *pt; struct nlattr *policy; bool again; int err; send_attribute: again = false; pt = &state->policies[state->policy_idx].policy[state->attr_idx]; policy = nla_nest_start(skb, state->policy_idx); if (!policy) return -ENOBUFS; err = __netlink_policy_dump_write_attr(state, skb, pt, state->attr_idx); if (err == -ENODATA) { nla_nest_cancel(skb, policy); again = true; goto next; } else if (err) { goto nla_put_failure; } /* finish and move state to next attribute */ nla_nest_end(skb, policy); next: state->attr_idx += 1; if (state->attr_idx > state->policies[state->policy_idx].maxtype) { state->attr_idx = 0; state->policy_idx++; } if (again) { if (netlink_policy_dump_finished(state)) return -ENODATA; goto send_attribute; } return 0; nla_put_failure: nla_nest_cancel(skb, policy); return -ENOBUFS; } /** * netlink_policy_dump_free - free policy dump state * @state: the policy dump state to free * * Call this from the done() method to ensure dump state is freed. */ void netlink_policy_dump_free(struct netlink_policy_dump_state *state) { kfree(state); }
114 189 119 71 72 61 61 12 54 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 /* * net/tipc/name_distr.c: TIPC name distribution code * * Copyright (c) 2000-2006, 2014-2019, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "name_distr.h" int sysctl_tipc_named_timeout __read_mostly = 2000; /** * publ_to_item - add publication info to a publication message * @p: publication info * @i: location of item in the message */ static void publ_to_item(struct distr_item *i, struct publication *p) { i->type = htonl(p->sr.type); i->lower = htonl(p->sr.lower); i->upper = htonl(p->sr.upper); i->port = htonl(p->sk.ref); i->key = htonl(p->key); } /** * named_prepare_buf - allocate & initialize a publication message * @net: the associated network namespace * @type: message type * @size: payload size * @dest: destination node * * The buffer returned is of size INT_H_SIZE + payload size */ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) { struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); u32 self = tipc_own_addr(net); struct tipc_msg *msg; if (buf != NULL) { msg = buf_msg(buf); tipc_msg_init(self, msg, NAME_DISTRIBUTOR, type, INT_H_SIZE, dest); msg_set_size(msg, INT_H_SIZE + size); } return buf; } /** * tipc_named_publish - tell other nodes about a new publication by this node * @net: the associated network namespace * @p: the new publication */ struct sk_buff *tipc_named_publish(struct net *net, struct publication *p) { struct name_table *nt = tipc_name_table(net); struct distr_item *item; struct sk_buff *skb; if (p->scope == TIPC_NODE_SCOPE) { list_add_tail_rcu(&p->binding_node, &nt->node_scope); return NULL; } write_lock_bh(&nt->cluster_scope_lock); list_add_tail(&p->binding_node, &nt->cluster_scope); write_unlock_bh(&nt->cluster_scope_lock); skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); if (!skb) { pr_warn("Publication distribution failure\n"); return NULL; } msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, p); return skb; } /** * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node * @net: the associated network namespace * @p: the withdrawn publication */ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *p) { struct name_table *nt = tipc_name_table(net); struct distr_item *item; struct sk_buff *skb; write_lock_bh(&nt->cluster_scope_lock); list_del(&p->binding_node); write_unlock_bh(&nt->cluster_scope_lock); if (p->scope == TIPC_NODE_SCOPE) return NULL; skb = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); if (!skb) { pr_warn("Withdrawal distribution failure\n"); return NULL; } msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, p); return skb; } /** * named_distribute - prepare name info for bulk distribution to another node * @net: the associated network namespace * @list: list of messages (buffers) to be returned from this function * @dnode: node to be updated * @pls: linked list of publication items to be packed into buffer chain * @seqno: sequence number for this message */ static void named_distribute(struct net *net, struct sk_buff_head *list, u32 dnode, struct list_head *pls, u16 seqno) { struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; struct tipc_msg *hdr; list_for_each_entry(publ, pls, binding_node) { /* Prepare next buffer: */ if (!skb) { skb = named_prepare_buf(net, PUBLICATION, msg_rem, dnode); if (!skb) { pr_warn("Bulk publication failure\n"); return; } hdr = buf_msg(skb); msg_set_bc_ack_invalid(hdr, true); msg_set_bulk(hdr); msg_set_non_legacy(hdr); item = (struct distr_item *)msg_data(hdr); } /* Pack publication into message: */ publ_to_item(item, publ); item++; msg_rem -= ITEM_SIZE; /* Append full buffer to list: */ if (!msg_rem) { __skb_queue_tail(list, skb); skb = NULL; msg_rem = msg_dsz; } } if (skb) { hdr = buf_msg(skb); msg_set_size(hdr, INT_H_SIZE + (msg_dsz - msg_rem)); skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); __skb_queue_tail(list, skb); } hdr = buf_msg(skb_peek_tail(list)); msg_set_last_bulk(hdr); msg_set_named_seqno(hdr, seqno); } /** * tipc_named_node_up - tell specified node about all publications by this node * @net: the associated network namespace * @dnode: destination node * @capabilities: peer node's capabilities */ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff_head head; u16 seqno; __skb_queue_head_init(&head); spin_lock_bh(&tn->nametbl_lock); if (!(capabilities & TIPC_NAMED_BCAST)) nt->rc_dests++; seqno = nt->snd_nxt; spin_unlock_bh(&tn->nametbl_lock); read_lock_bh(&nt->cluster_scope_lock); named_distribute(net, &head, dnode, &nt->cluster_scope, seqno); tipc_node_xmit(net, &head, dnode, 0); read_unlock_bh(&nt->cluster_scope_lock); } /** * tipc_publ_purge - remove publication associated with a failed node * @net: the associated network namespace * @p: the publication to remove * @addr: failed node's address * * Invoked for each publication issued by a newly failed node. * Removes publication structure from name table & deletes it. */ static void tipc_publ_purge(struct net *net, struct publication *p, u32 addr) { struct tipc_net *tn = tipc_net(net); struct publication *_p; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, p->scope, p->sr.type, p->sr.lower, p->sr.upper); spin_lock_bh(&tn->nametbl_lock); _p = tipc_nametbl_remove_publ(net, &ua, &p->sk, p->key); if (_p) tipc_node_unsubscribe(net, &_p->binding_node, addr); spin_unlock_bh(&tn->nametbl_lock); if (_p) kfree_rcu(_p, rcu); } void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr, u16 capabilities) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *publ, *tmp; list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); spin_lock_bh(&tn->nametbl_lock); if (!(capabilities & TIPC_NAMED_BCAST)) nt->rc_dests--; spin_unlock_bh(&tn->nametbl_lock); } /** * tipc_update_nametbl - try to process a nametable update and notify * subscribers * @net: the associated network namespace * @i: location of item in the message * @node: node address * @dtype: name distributor message type * * tipc_nametbl_lock must be held. * Return: the publication item if successful, otherwise NULL. */ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) { struct publication *p = NULL; struct tipc_socket_addr sk; struct tipc_uaddr ua; u32 key = ntohl(i->key); tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, ntohl(i->type), ntohl(i->lower), ntohl(i->upper)); sk.ref = ntohl(i->port); sk.node = node; if (dtype == PUBLICATION) { p = tipc_nametbl_insert_publ(net, &ua, &sk, key); if (p) { tipc_node_subscribe(net, &p->binding_node, node); return true; } } else if (dtype == WITHDRAWAL) { p = tipc_nametbl_remove_publ(net, &ua, &sk, key); if (p) { tipc_node_unsubscribe(net, &p->binding_node, node); kfree_rcu(p, rcu); return true; } pr_warn_ratelimited("Failed to remove binding %u,%u from %u\n", ua.sr.type, ua.sr.lower, node); } else { pr_warn_ratelimited("Unknown name table message received\n"); } return false; } static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, u16 *rcv_nxt, bool *open) { struct sk_buff *skb, *tmp; struct tipc_msg *hdr; u16 seqno; spin_lock_bh(&namedq->lock); skb_queue_walk_safe(namedq, skb, tmp) { if (unlikely(skb_linearize(skb))) { __skb_unlink(skb, namedq); kfree_skb(skb); continue; } hdr = buf_msg(skb); seqno = msg_named_seqno(hdr); if (msg_is_last_bulk(hdr)) { *rcv_nxt = seqno; *open = true; } if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) { __skb_unlink(skb, namedq); spin_unlock_bh(&namedq->lock); return skb; } if (*open && (*rcv_nxt == seqno)) { (*rcv_nxt)++; __skb_unlink(skb, namedq); spin_unlock_bh(&namedq->lock); return skb; } if (less(seqno, *rcv_nxt)) { __skb_unlink(skb, namedq); kfree_skb(skb); continue; } } spin_unlock_bh(&namedq->lock); return NULL; } /** * tipc_named_rcv - process name table update messages sent by another node * @net: the associated network namespace * @namedq: queue to receive from * @rcv_nxt: store last received seqno here * @open: last bulk msg was received (FIXME) */ void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, u16 *rcv_nxt, bool *open) { struct tipc_net *tn = tipc_net(net); struct distr_item *item; struct tipc_msg *hdr; struct sk_buff *skb; u32 count, node; spin_lock_bh(&tn->nametbl_lock); while ((skb = tipc_named_dequeue(namedq, rcv_nxt, open))) { hdr = buf_msg(skb); node = msg_orignode(hdr); item = (struct distr_item *)msg_data(hdr); count = msg_data_sz(hdr) / ITEM_SIZE; while (count--) { tipc_update_nametbl(net, item, node, msg_type(hdr)); item++; } kfree_skb(skb); } spin_unlock_bh(&tn->nametbl_lock); } /** * tipc_named_reinit - re-initialize local publications * @net: the associated network namespace * * This routine is called whenever TIPC networking is enabled. * All name table entries published by this node are updated to reflect * the node's new network address. */ void tipc_named_reinit(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p; u32 self = tipc_own_addr(net); spin_lock_bh(&tn->nametbl_lock); list_for_each_entry_rcu(p, &nt->node_scope, binding_node) p->sk.node = self; list_for_each_entry_rcu(p, &nt->cluster_scope, binding_node) p->sk.node = self; nt->rc_dests = 0; spin_unlock_bh(&tn->nametbl_lock); }
11 10 24 10 17 277 7 466 465 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 /* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ #include <linux/bitfield.h> #include <linux/filter.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ #include <net/page_pool/types.h> /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset). * * For now, direct access is only safe while running in NAPI/softirq * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. * * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. */ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; }; struct page_pool; struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; struct xdp_mem_info mem; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ }; struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } static __always_inline bool xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->frame_sz = frame_sz; xdp->rxq = rxq; xdp->flags = 0; } static __always_inline void xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, int headroom, int data_len, const bool meta_valid) { unsigned char *data = hard_start + headroom; xdp->data_hard_start = hard_start; xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; } /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the * XDP/BPF data access to data_hard_end. Notice same area (and size) * is used for XDP_PASS, when constructing the SKB via build_skb(). */ #define xdp_data_hard_end(xdp) \ ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } static __always_inline unsigned int xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); len += sinfo->xdp_frags_size; out: return len; } void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp); /** * __xdp_buff_add_frag - attach frag to &xdp_buff * @xdp: XDP buffer to attach the frag to * @netmem: network memory containing the frag * @offset: offset at which the frag starts * @size: size of the frag * @truesize: total memory size occupied by the frag * @try_coalesce: whether to try coalescing the frags (not valid for XSk) * * Attach frag to the XDP buffer. If it currently has no frags attached, * initialize the related fields, otherwise check that the frag number * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing * the frag with the previous one. * The function doesn't check/update the pfmemalloc bit. Please use the * non-underscored wrapper in drivers. * * Return: true on success, false if there's no space for the frag in * the shared info struct. */ static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, u32 offset, u32 size, u32 truesize, bool try_coalesce) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *prev; u32 nr_frags; if (!xdp_buff_has_frags(xdp)) { xdp_buff_set_frags_flag(xdp); nr_frags = 0; sinfo->xdp_frags_size = 0; sinfo->xdp_frags_truesize = 0; goto fill; } nr_frags = sinfo->nr_frags; prev = &sinfo->frags[nr_frags - 1]; if (try_coalesce && netmem == skb_frag_netmem(prev) && offset == skb_frag_off(prev) + skb_frag_size(prev)) { skb_frag_size_add(prev, size); /* Guaranteed to only decrement the refcount */ xdp_return_frag(netmem, xdp); } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) { return false; } else { fill: __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem, offset, size); } sinfo->nr_frags = nr_frags; sinfo->xdp_frags_size += size; sinfo->xdp_frags_truesize += truesize; return true; } /** * xdp_buff_add_frag - attach frag to &xdp_buff * @xdp: XDP buffer to attach the frag to * @netmem: network memory containing the frag * @offset: offset at which the frag starts * @size: size of the frag * @truesize: total memory size occupied by the frag * * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. * * Return: true on success, false if there's no space for the frag in * the shared info struct. */ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, u32 offset, u32 size, u32 truesize) { if (!__xdp_buff_add_frag(xdp, netmem, offset, size, truesize, true)) return false; if (unlikely(netmem_is_pfmemalloc(netmem))) xdp_buff_set_frag_pfmemalloc(xdp); return true; } struct xdp_frame { void *data; u32 len; u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem_type is valid on remote CPU. */ enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline bool xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { bq->count = 0; } static inline struct skb_shared_info * xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); return (struct skb_shared_info *)(data_hard_start + frame->frame_sz - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); } struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { frame->data = NULL; frame->dev_rx = NULL; } static inline void xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { struct skb_shared_info *sinfo = skb_shinfo(skb); sinfo->nr_frags = nr_frags; /* * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, * reset it after that these fields aren't used anymore. */ sinfo->destructor_arg = NULL; skb->len += size; skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= pfmemalloc; } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(const struct xdp_frame *frame, struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; } static inline int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? metasize : 0; if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return -ENOSPC; /* Catch if driver didn't reserve tailroom for skb_shared_info */ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { XDP_WARN("Driver BUG: missing reserved tailroom"); return -ENOSPC; } xdp_frame->data = xdp->data; xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; xdp_frame->flags = xdp->flags; return 0; } /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) { if (unlikely(!bq->count)) return; page_pool_put_netmem_bulk(bq->q, bq->count); bq->count = 0; } static __always_inline unsigned int xdp_get_frame_len(const struct xdp_frame *xdpf) { const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); len += sinfo->xdp_frags_size; out: return len; } int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); static inline int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id) { return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); int xdp_reg_page_pool(struct page_pool *pool); void xdp_unreg_page_pool(const struct page_pool *pool); void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, const struct page_pool *pool); /** * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info * @xdp_rxq: XDP RxQ info to attach the memory info to * @mem: already registered memory info * * If the driver registers its memory providers manually, it must use this * function instead of xdp_rxq_info_reg_mem_model(). */ static inline void xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, const struct xdp_mem_info *mem) { xdp_rxq->mem = *mem; } /** * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info * @xdp_rxq: XDP RxQ info to detach the memory info from * * If the driver registers its memory providers manually and then attaches it * via xdp_rxq_info_attach_mem_model(), it must call this function before * xdp_rxq_info_unreg(). */ static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) { xdp_rxq->mem = (struct xdp_mem_info){ }; } /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. */ static __always_inline void xdp_set_data_meta_invalid(struct xdp_buff *xdp) { xdp->data_meta = xdp->data + 1; } static __always_inline bool xdp_data_meta_unsupported(const struct xdp_buff *xdp) { return unlikely(xdp->data_meta > xdp->data); } static inline bool xdp_metalen_invalid(unsigned long metalen) { unsigned long meta_max; meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); BUILD_BUG_ON(!__builtin_constant_p(meta_max)); return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ NETDEV_XDP_RX_METADATA_VLAN_TAG, \ bpf_xdp_metadata_rx_vlan_tag, \ xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, }; enum xdp_rss_hash_type { /* First part: Individual bits for L3/L4 types */ XDP_RSS_L3_IPV4 = BIT(0), XDP_RSS_L3_IPV6 = BIT(1), /* The fixed (L3) IPv4 and IPv6 headers can both be followed by * variable/dynamic headers, IPv4 called Options and IPv6 called * Extension Headers. HW RSS type can contain this info. */ XDP_RSS_L3_DYNHDR = BIT(2), /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in * addition to the protocol specific bit. This ease interaction with * SKBs and avoids reserving a fixed mask for future L4 protocol bits. */ XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ XDP_RSS_L4_TCP = BIT(4), XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_set_redirect_target_locked(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } static inline void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { } static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { } static inline void xdp_features_clear_redirect_target(struct net_device *dev) { } #endif static inline void xdp_clear_features_flag(struct net_device *dev) { xdp_set_features_flag(dev, 0); } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } #endif /* __LINUX_NET_XDP_H__ */
15 16 15 9 14 1563 1469 1472 757 758 752 5 207 27 217 221 5 9 4 3 146 1 379 309 213 310 125 22 386 9 382 209 4 204 203 389 390 210 390 71 70 13 388 390 390 1 1 1 1 1 1 1 25 1 24 22 1 18 4 14 1 3 12 5 12 5 17 6 6 2 12 12 10 10 8 1 8 8 7 6 6 26 1 26 27 22 8 22 23 23 23 110 110 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 // SPDX-License-Identifier: GPL-2.0-only /* * net/ipv6/fib6_rules.c IPv6 Routing Policy Rules * * Copyright (C)2003-2006 Helsinki University of Technology * Copyright (C)2003-2006 USAGI/WIDE Project * * Authors * Thomas Graf <tgraf@suug.ch> * Ville Nuorvala <vnuorval@tcs.hut.fi> */ #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/export.h> #include <linux/indirect_call_wrapper.h> #include <net/fib_rules.h> #include <net/inet_dscp.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/netlink.h> struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; __be32 flowlabel; __be32 flowlabel_mask; dscp_t dscp; dscp_t dscp_mask; u8 dscp_full:1; /* DSCP or TOS selector */ }; static bool fib6_rule_matchall(const struct fib_rule *rule) { struct fib6_rule *r = container_of(rule, struct fib6_rule, common); if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask) return false; return fib_rule_matchall(rule); } bool fib6_rule_default(const struct fib_rule *rule) { if (!fib6_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || rule->l3mdev) return false; if (rule->table != RT6_TABLE_LOCAL && rule->table != RT6_TABLE_MAIN) return false; return true; } EXPORT_SYMBOL_GPL(fib6_rule_default); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, AF_INET6, extack); } unsigned int fib6_rules_seq_read(const struct net *net) { return fib_rules_seq_read(net, AF_INET6); } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { int err; if (net->ipv6.fib6_has_custom_rules) { struct fib_lookup_arg arg = { .lookup_ptr = fib6_table_lookup, .lookup_data = &oif, .result = res, .flags = FIB_LOOKUP_NOREF, }; l3mdev_update_flow(net, flowi6_to_flowi(fl6)); err = fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); } else { err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif, fl6, res, flags); if (err || res->f6i == net->ipv6.fib6_null_entry) err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } return err; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { if (net->ipv6.fib6_has_custom_rules) { struct fib6_result res = {}; struct fib_lookup_arg arg = { .lookup_ptr = lookup, .lookup_data = skb, .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(fl6)); fib_rules_lookup(net->ipv6.fib6_rules_ops, flowi6_to_flowi(fl6), flags, &arg); if (res.rt6) return &res.rt6->dst; } else { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_local_tbl, fl6, skb, flags); if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error != -EAGAIN) return &rt->dst; ip6_rt_put_flags(rt, flags); } if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&net->ipv6.ip6_null_entry->dst); return &net->ipv6.ip6_null_entry->dst; } static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags, struct flowi6 *flp6, const struct net_device *dev) { struct fib6_rule *r = (struct fib6_rule *)rule; /* If we need to find a source address for this traffic, * we check the result if it meets requirement of the rule. */ if ((rule->flags & FIB_RULE_FIND_SADDR) && r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) { struct in6_addr saddr; if (ipv6_dev_get_saddr(net, dev, &flp6->daddr, rt6_flags2srcprefs(flags), &saddr)) return -EAGAIN; if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen)) return -EAGAIN; flp6->saddr = saddr; } return 0; } static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct net *net = rule->fr_net; struct fib6_table *table; int err, *oif; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } tb_id = fib_rule_get_table(rule, arg); table = fib6_get_table(net, tb_id); if (!table) return -EAGAIN; oif = (int *)arg->lookup_data; err = fib6_table_lookup(net, table, *oif, flp6, res, flags); if (!err && res->f6i != net->ipv6.fib6_null_entry) err = fib6_rule_saddr(net, rule, flags, flp6, res->nh->fib_nh_dev); else err = -EAGAIN; return err; } static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct flowi6 *flp6 = &flp->u.ip6; struct rt6_info *rt = NULL; struct fib6_table *table; struct net *net = rule->fr_net; pol_lookup_t lookup = arg->lookup_ptr; int err = 0; u32 tb_id; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: err = -ENETUNREACH; rt = net->ipv6.ip6_null_entry; goto discard_pkt; default: case FR_ACT_BLACKHOLE: err = -EINVAL; rt = net->ipv6.ip6_blk_hole_entry; goto discard_pkt; case FR_ACT_PROHIBIT: err = -EACCES; rt = net->ipv6.ip6_prohibit_entry; goto discard_pkt; } tb_id = fib_rule_get_table(rule, arg); table = fib6_get_table(net, tb_id); if (!table) { err = -EAGAIN; goto out; } rt = pol_lookup_func(lookup, net, table, flp6, arg->lookup_data, flags); if (rt != net->ipv6.ip6_null_entry) { struct inet6_dev *idev = ip6_dst_idev(&rt->dst); if (!idev) goto again; err = fib6_rule_saddr(net, rule, flags, flp6, idev->dev); if (err == -EAGAIN) goto again; err = rt->dst.error; if (err != -EAGAIN) goto out; } again: ip6_rt_put_flags(rt, flags); err = -EAGAIN; rt = NULL; goto out; discard_pkt: if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); out: res->rt6 = rt; return err; } INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { if (arg->lookup_ptr == fib6_table_lookup) return fib6_rule_action_alt(rule, flp, flags, arg); return __fib6_rule_action(rule, flp, flags, arg); } INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; struct rt6_info *rt = res->rt6; struct net_device *dev = NULL; if (!rt) return false; if (rt->rt6i_idev) dev = rt->rt6i_idev->dev; /* do not accept result if the route does * not meet the required prefix length */ if (rt->rt6i_dst.plen <= rule->suppress_prefixlen) goto suppress_route; /* do not accept result if the route uses a device * belonging to a forbidden interface group */ if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup) goto suppress_route; return false; suppress_route: ip6_rt_put_flags(rt, flags); return true; } INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib6_rule *r = (struct fib6_rule *) rule; struct flowi6 *fl6 = &fl->u.ip6; if (r->dst.plen && !ipv6_prefix_equal(&fl6->daddr, &r->dst.addr, r->dst.plen)) return 0; /* * If FIB_RULE_FIND_SADDR is set and we do not have a * source address for the traffic, we defer check for * source address. */ if (r->src.plen) { if (flags & RT6_LOOKUP_F_HAS_SADDR) { if (!ipv6_prefix_equal(&fl6->saddr, &r->src.addr, r->src.plen)) return 0; } else if (!(r->common.flags & FIB_RULE_FIND_SADDR)) return 0; } if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask) return 0; if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask) return 0; if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, fl6->fl6_sport)) return 0; if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, fl6->fl6_dport)) return 0; return 1; } static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, struct netlink_ext_ack *extack) { if (rule6->dscp) { NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP"); return -EINVAL; } rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); rule6->dscp_full = true; return 0; } static int fib6_nl2rule_dscp_mask(const struct nlattr *nla, struct fib6_rule *rule6, struct netlink_ext_ack *extack) { dscp_t dscp_mask; if (!rule6->dscp_full) { NL_SET_ERR_MSG_ATTR(extack, nla, "Cannot specify DSCP mask without DSCP value"); return -EINVAL; } dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); if (rule6->dscp & ~dscp_mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); return -EINVAL; } rule6->dscp_mask = dscp_mask; return 0; } static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6, struct netlink_ext_ack *extack) { __be32 flowlabel, flowlabel_mask; if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) || NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK)) return -EINVAL; flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]); flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]); if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) { NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK], "Invalid flow label mask"); return -EINVAL; } if (flowlabel & ~flowlabel_mask) { NL_SET_ERR_MSG(extack, "Flow label and mask do not match"); return -EINVAL; } rule6->flowlabel = flowlabel; rule6->flowlabel_mask = flowlabel_mask; return 0; } static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct fib6_rule *rule6 = (struct fib6_rule *)rule; struct net *net = rule->fr_net; int err = -EINVAL; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): ECN bits must be 0"); goto errout; } rule6->dscp = inet_dsfield_to_dscp(frh->tos); rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0; if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) goto errout; if (tb[FRA_DSCP_MASK] && fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0) goto errout; if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) && fib6_nl2rule_flowlabel(tb, rule6, extack) < 0) goto errout; if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid table"); goto errout; } if (fib6_new_table(net, rule->table) == NULL) { err = -ENOBUFS; goto errout; } } if (frh->src_len) rule6->src.addr = nla_get_in6_addr(tb[FRA_SRC]); if (frh->dst_len) rule6->dst.addr = nla_get_in6_addr(tb[FRA_DST]); rule6->src.plen = frh->src_len; rule6->dst.plen = frh->dst_len; if (fib_rule_requires_fldissect(rule)) net->ipv6.fib6_rules_require_fldissect++; net->ipv6.fib6_has_custom_rules = true; err = 0; errout: return err; } static int fib6_rule_delete(struct fib_rule *rule) { struct net *net = rule->fr_net; if (net->ipv6.fib6_rules_require_fldissect && fib_rule_requires_fldissect(rule)) net->ipv6.fib6_rules_require_fldissect--; return 0; } static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (frh->src_len && (rule6->src.plen != frh->src_len)) return 0; if (frh->dst_len && (rule6->dst.plen != frh->dst_len)) return 0; if (frh->tos && (rule6->dscp_full || inet_dscp_to_dsfield(rule6->dscp) != frh->tos)) return 0; if (tb[FRA_DSCP]) { dscp_t dscp; dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2); if (!rule6->dscp_full || rule6->dscp != dscp) return 0; } if (tb[FRA_DSCP_MASK]) { dscp_t dscp_mask; dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask) return 0; } if (tb[FRA_FLOWLABEL] && nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel) return 0; if (tb[FRA_FLOWLABEL_MASK] && nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask) return 0; if (frh->src_len && nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) return 0; if (frh->dst_len && nla_memcmp(tb[FRA_DST], &rule6->dst.addr, sizeof(struct in6_addr))) return 0; return 1; } static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { struct fib6_rule *rule6 = (struct fib6_rule *) rule; frh->dst_len = rule6->dst.plen; frh->src_len = rule6->src.plen; if (rule6->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, inet_dscp_to_dsfield(rule6->dscp) >> 2) || nla_put_u8(skb, FRA_DSCP_MASK, inet_dscp_to_dsfield(rule6->dscp_mask) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule6->dscp); } if (rule6->flowlabel_mask && (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) || nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask))) goto nla_put_failure; if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || (rule6->src.plen && nla_put_in6_addr(skb, FRA_SRC, &rule6->src.addr))) goto nla_put_failure; return 0; nla_put_failure: return -ENOBUFS; } static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ + nla_total_size(16) /* src */ + nla_total_size(1) /* dscp */ + nla_total_size(1) /* dscp mask */ + nla_total_size(4) /* flowlabel */ + nla_total_size(4); /* flowlabel mask */ } static void fib6_rule_flush_cache(struct fib_rules_ops *ops) { rt_genid_bump_ipv6(ops->fro_net); } static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), .addr_size = sizeof(struct in6_addr), .action = fib6_rule_action, .match = fib6_rule_match, .suppress = fib6_rule_suppress, .configure = fib6_rule_configure, .delete = fib6_rule_delete, .compare = fib6_rule_compare, .fill = fib6_rule_fill, .nlmsg_payload = fib6_rule_nlmsg_payload, .flush_cache = fib6_rule_flush_cache, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, .fro_net = &init_net, }; static int __net_init fib6_rules_net_init(struct net *net) { struct fib_rules_ops *ops; int err; ops = fib_rules_register(&fib6_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL); if (err) goto out_fib6_rules_ops; err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN); if (err) goto out_fib6_rules_ops; net->ipv6.fib6_rules_ops = ops; net->ipv6.fib6_rules_require_fldissect = 0; out: return err; out_fib6_rules_ops: fib_rules_unregister(ops); goto out; } static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { fib_rules_unregister(net->ipv6.fib6_rules_ops); cond_resched(); } rtnl_unlock(); } static struct pernet_operations fib6_rules_net_ops = { .init = fib6_rules_net_init, .exit_batch = fib6_rules_net_exit_batch, }; int __init fib6_rules_init(void) { return register_pernet_subsys(&fib6_rules_net_ops); } void fib6_rules_cleanup(void) { unregister_pernet_subsys(&fib6_rules_net_ops); }
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_VXLAN_H #define __NET_VXLAN_H 1 #include <linux/if_vlan.h> #include <linux/rhashtable-types.h> #include <net/udp_tunnel.h> #include <net/dst_metadata.h> #include <net/rtnetlink.h> #include <net/switchdev.h> #include <net/nexthop.h> #define IANA_VXLAN_UDP_PORT 4789 #define IANA_VXLAN_GPE_UDP_PORT 4790 /* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|I|R|R|R| Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * I = VXLAN Network Identifier (VNI) present. */ struct vxlanhdr { __be32 vx_flags; __be32 vx_vni; }; /* VXLAN header flags. */ #define VXLAN_HF_VNI cpu_to_be32(BIT(27)) #define VXLAN_N_VID (1u << 24) #define VXLAN_VID_MASK (VXLAN_N_VID - 1) #define VXLAN_VNI_MASK cpu_to_be32(VXLAN_VID_MASK << 8) #define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) #define VNI_HASH_BITS 10 #define VNI_HASH_SIZE (1<<VNI_HASH_BITS) #define FDB_HASH_BITS 8 #define FDB_HASH_SIZE (1<<FDB_HASH_BITS) /* Remote checksum offload for VXLAN (VXLAN_F_REMCSUM_[RT]X): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|I|R|R|R|R|R|C| Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) |O| Csum start | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * C = Remote checksum offload bit. When set indicates that the * remote checksum offload data is present. * * O = Offset bit. Indicates the checksum offset relative to * checksum start. * * Csum start = Checksum start divided by two. * * http://tools.ietf.org/html/draft-herbert-vxlan-rco */ /* VXLAN-RCO header flags. */ #define VXLAN_HF_RCO cpu_to_be32(BIT(21)) /* Remote checksum offload header option */ #define VXLAN_RCO_MASK cpu_to_be32(0x7f) /* Last byte of vni field */ #define VXLAN_RCO_UDP cpu_to_be32(0x80) /* Indicate UDP RCO (TCP when not set *) */ #define VXLAN_RCO_SHIFT 1 /* Left shift of start */ #define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) #define VXLAN_MAX_REMCSUM_START (0x7f << VXLAN_RCO_SHIFT) /* * VXLAN Group Based Policy Extension (VXLAN_F_GBP): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |G|R|R|R|I|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * G = Group Policy ID present. * * D = Don't Learn bit. When set, this bit indicates that the egress * VTEP MUST NOT learn the source address of the encapsulated frame. * * A = Indicates that the group policy has already been applied to * this packet. Policies MUST NOT be applied by devices when the * A bit is set. * * https://tools.ietf.org/html/draft-smith-vxlan-group-policy */ struct vxlanhdr_gbp { u8 vx_flags; #ifdef __LITTLE_ENDIAN_BITFIELD u8 reserved_flags1:3, policy_applied:1, reserved_flags2:2, dont_learn:1, reserved_flags3:1; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved_flags1:1, dont_learn:1, reserved_flags2:2, policy_applied:1, reserved_flags3:3; #else #error "Please fix <asm/byteorder.h>" #endif __be16 policy_id; __be32 vx_vni; }; /* VXLAN-GBP header flags. */ #define VXLAN_HF_GBP cpu_to_be32(BIT(31)) #define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | cpu_to_be32(0xFFFFFF)) /* skb->mark mapping * * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|R|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define VXLAN_GBP_DONT_LEARN (BIT(6) << 16) #define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) #define VXLAN_GBP_ID_MASK (0xFFFF) #define VXLAN_GBP_MASK (VXLAN_GBP_DONT_LEARN | VXLAN_GBP_POLICY_APPLIED | \ VXLAN_GBP_ID_MASK) /* * VXLAN Generic Protocol Extension (VXLAN_F_GPE): * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|Ver|I|P|R|O| Reserved |Next Protocol | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | VXLAN Network Identifier (VNI) | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Ver = Version. Indicates VXLAN GPE protocol version. * * P = Next Protocol Bit. The P bit is set to indicate that the * Next Protocol field is present. * * O = OAM Flag Bit. The O bit is set to indicate that the packet * is an OAM packet. * * Next Protocol = This 8 bit field indicates the protocol header * immediately following the VXLAN GPE header. * * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01 */ struct vxlanhdr_gpe { #if defined(__LITTLE_ENDIAN_BITFIELD) u8 oam_flag:1, reserved_flags1:1, np_applied:1, instance_applied:1, version:2, reserved_flags2:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved_flags2:2, version:2, instance_applied:1, np_applied:1, reserved_flags1:1, oam_flag:1; #endif u8 reserved_flags3; u8 reserved_flags4; u8 next_protocol; __be32 vx_vni; }; /* VXLAN-GPE header flags. */ #define VXLAN_HF_VER cpu_to_be32(BIT(29) | BIT(28)) #define VXLAN_HF_NP cpu_to_be32(BIT(26)) #define VXLAN_HF_OAM cpu_to_be32(BIT(24)) #define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \ cpu_to_be32(0xff)) struct vxlan_metadata { u32 gbp; }; /* per UDP socket information */ struct vxlan_sock { struct hlist_node hlist; struct socket *sock; struct hlist_head vni_list[VNI_HASH_SIZE]; refcount_t refcnt; u32 flags; }; union vxlan_addr { struct sockaddr_in sin; struct sockaddr_in6 sin6; struct sockaddr sa; }; struct vxlan_rdst { union vxlan_addr remote_ip; __be16 remote_port; u8 offloaded:1; __be32 remote_vni; u32 remote_ifindex; struct net_device *remote_dev; struct list_head list; struct rcu_head rcu; struct dst_cache dst_cache; }; struct vxlan_config { union vxlan_addr remote_ip; union vxlan_addr saddr; __be32 vni; int remote_ifindex; int mtu; __be16 dst_port; u16 port_min; u16 port_max; u8 tos; u8 ttl; __be32 label; enum ifla_vxlan_label_policy label_policy; u32 flags; unsigned long age_interval; unsigned int addrmax; bool no_share; enum ifla_vxlan_df df; struct vxlanhdr reserved_bits; }; enum { VXLAN_VNI_STATS_RX, VXLAN_VNI_STATS_RX_DROPS, VXLAN_VNI_STATS_RX_ERRORS, VXLAN_VNI_STATS_TX, VXLAN_VNI_STATS_TX_DROPS, VXLAN_VNI_STATS_TX_ERRORS, }; struct vxlan_vni_stats { u64 rx_packets; u64 rx_bytes; u64 rx_drops; u64 rx_errors; u64 tx_packets; u64 tx_bytes; u64 tx_drops; u64 tx_errors; }; struct vxlan_vni_stats_pcpu { struct vxlan_vni_stats stats; struct u64_stats_sync syncp; }; struct vxlan_dev_node { struct hlist_node hlist; struct vxlan_dev *vxlan; }; struct vxlan_vni_node { struct rhash_head vnode; struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct list_head vlist; __be32 vni; union vxlan_addr remote_ip; /* default remote ip for this vni */ struct vxlan_vni_stats_pcpu __percpu *stats; struct rcu_head rcu; }; struct vxlan_vni_group { struct rhashtable vni_hash; struct list_head vni_list; u32 num_vnis; }; /* Pseudo network device */ struct vxlan_dev { struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ #endif struct list_head next; /* vxlan's per namespace list */ struct vxlan_sock __rcu *vn4_sock; /* listening socket for IPv4 */ #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock __rcu *vn6_sock; /* listening socket for IPv6 */ #endif struct net_device *dev; struct net *net; /* netns for packet i/o */ struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; struct vxlan_config cfg; struct vxlan_vni_group __rcu *vnigrp; struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; #define VXLAN_F_LEARN 0x01 #define VXLAN_F_PROXY 0x02 #define VXLAN_F_RSC 0x04 #define VXLAN_F_L2MISS 0x08 #define VXLAN_F_L3MISS 0x10 #define VXLAN_F_IPV6 0x20 #define VXLAN_F_UDP_ZERO_CSUM_TX 0x40 #define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 #define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 #define VXLAN_F_REMCSUM_TX 0x200 #define VXLAN_F_REMCSUM_RX 0x400 #define VXLAN_F_GBP 0x800 #define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 #define VXLAN_F_COLLECT_METADATA 0x2000 #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 #define VXLAN_F_TTL_INHERIT 0x10000 #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable */ #define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \ VXLAN_F_GPE | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER) /* Flags that can be set together with VXLAN_F_GPE. */ #define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ VXLAN_F_IPV6 | \ VXLAN_F_IPV6_LINKLOCAL | \ VXLAN_F_UDP_ZERO_CSUM_TX | \ VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ VXLAN_F_LOCALBYPASS) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, netdev_features_t features) { u8 l4_hdr = 0; if (!skb->encapsulation) return features; switch (vlan_get_protocol(skb)) { case htons(ETH_P_IP): l4_hdr = ip_hdr(skb)->protocol; break; case htons(ETH_P_IPV6): l4_hdr = ipv6_hdr(skb)->nexthdr; break; default: return features; } if ((l4_hdr == IPPROTO_UDP) && (skb->inner_protocol_type != ENCAP_TYPE_ETHER || skb->inner_protocol != htons(ETH_P_TEB) || (skb_inner_mac_header(skb) - skb_transport_header(skb) != sizeof(struct udphdr) + sizeof(struct vxlanhdr)) || (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, inner_eth_hdr(skb)->h_proto)))) return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); return features; } static inline int vxlan_headroom(u32 flags) { /* VXLAN: IP4/6 header + UDP + VXLAN + Ethernet header */ /* VXLAN-GPE: IP4/6 header + UDP + VXLAN */ return (flags & VXLAN_F_IPV6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr)) + sizeof(struct udphdr) + sizeof(struct vxlanhdr) + (flags & VXLAN_F_GPE ? 0 : ETH_HLEN); } static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) { return (struct vxlanhdr *)(udp_hdr(skb) + 1); } static inline __be32 vxlan_vni(__be32 vni_field) { #if defined(__BIG_ENDIAN) return (__force __be32)((__force u32)vni_field >> 8); #else return (__force __be32)((__force u32)(vni_field & VXLAN_VNI_MASK) << 8); #endif } static inline __be32 vxlan_vni_field(__be32 vni) { #if defined(__BIG_ENDIAN) return (__force __be32)((__force u32)vni << 8); #else return (__force __be32)((__force u32)vni >> 8); #endif } static inline size_t vxlan_rco_start(__be32 vni_field) { return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; } static inline size_t vxlan_rco_offset(__be32 vni_field) { return (vni_field & VXLAN_RCO_UDP) ? offsetof(struct udphdr, check) : offsetof(struct tcphdr, check); } static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) { __be32 vni_field = cpu_to_be32(start >> VXLAN_RCO_SHIFT); if (offset == offsetof(struct udphdr, check)) vni_field |= VXLAN_RCO_UDP; return vni_field; } static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) { return vs->sock->sk->sk_family; } #if IS_ENABLED(CONFIG_IPV6) static inline bool vxlan_addr_any(const union vxlan_addr *ipa) { if (ipa->sa.sa_family == AF_INET6) return ipv6_addr_any(&ipa->sin6.sin6_addr); else return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); } static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) { if (ipa->sa.sa_family == AF_INET6) return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); else return ipv4_is_multicast(ipa->sin.sin_addr.s_addr); } #else /* !IS_ENABLED(CONFIG_IPV6) */ static inline bool vxlan_addr_any(const union vxlan_addr *ipa) { return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); } static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) { return ipv4_is_multicast(ipa->sin.sin_addr.s_addr); } #endif /* IS_ENABLED(CONFIG_IPV6) */ static inline bool netif_is_vxlan(const struct net_device *dev) { return dev->rtnl_link_ops && !strcmp(dev->rtnl_link_ops->kind, "vxlan"); } struct switchdev_notifier_vxlan_fdb_info { struct switchdev_notifier_info info; /* must be first */ union vxlan_addr remote_ip; __be16 remote_port; __be32 remote_vni; u32 remote_ifindex; u8 eth_addr[ETH_ALEN]; __be32 vni; bool offloaded; bool added_by_user; }; #if IS_ENABLED(CONFIG_VXLAN) int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info); int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack); void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni); #else static inline int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { return -ENOENT; } static inline int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) { } #endif static inline void vxlan_flag_attr_error(int attrtype, struct netlink_ext_ack *extack) { #define VXLAN_FLAG(flg) \ case IFLA_VXLAN_##flg: \ NL_SET_ERR_MSG_MOD(extack, \ "cannot change " #flg " flag"); \ break switch (attrtype) { VXLAN_FLAG(TTL_INHERIT); VXLAN_FLAG(LEARNING); VXLAN_FLAG(PROXY); VXLAN_FLAG(RSC); VXLAN_FLAG(L2MISS); VXLAN_FLAG(L3MISS); VXLAN_FLAG(COLLECT_METADATA); VXLAN_FLAG(UDP_ZERO_CSUM6_TX); VXLAN_FLAG(UDP_ZERO_CSUM6_RX); VXLAN_FLAG(REMCSUM_TX); VXLAN_FLAG(REMCSUM_RX); VXLAN_FLAG(GBP); VXLAN_FLAG(GPE); VXLAN_FLAG(REMCSUM_NOPARTIAL); default: NL_SET_ERR_MSG_MOD(extack, \ "cannot change flag"); break; } #undef VXLAN_FLAG } static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, u32 hash, struct vxlan_rdst *rdst) { struct fib_nh_common *nhc; nhc = nexthop_path_fdb_result(nh, hash >> 1); if (unlikely(!nhc)) return false; switch (nhc->nhc_gw_family) { case AF_INET: rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4; rdst->remote_ip.sa.sa_family = AF_INET; break; case AF_INET6: rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6; rdst->remote_ip.sa.sa_family = AF_INET6; break; } return true; } static inline void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, const struct vxlan_metadata *md) { struct vxlanhdr_gbp *gbp; if (!md->gbp) return; gbp = (struct vxlanhdr_gbp *)vxh; vxh->vx_flags |= VXLAN_HF_GBP; if (md->gbp & VXLAN_GBP_DONT_LEARN) gbp->dont_learn = 1; if (md->gbp & VXLAN_GBP_POLICY_APPLIED) gbp->policy_applied = 1; gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); } #endif
6 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 // SPDX-License-Identifier: GPL-2.0-only /* * This file provides /sys/class/ieee80211/<wiphy name>/ * and some default attributes. * * Copyright 2005-2006 Jiri Benc <jbenc@suse.cz> * Copyright 2006 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2020-2021, 2023-2024 Intel Corporation */ #include <linux/device.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "sysfs.h" #include "core.h" #include "rdev-ops.h" static inline struct cfg80211_registered_device *dev_to_rdev( struct device *dev) { return container_of(dev, struct cfg80211_registered_device, wiphy.dev); } #define SHOW_FMT(name, fmt, member) \ static ssize_t name ## _show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ } \ static DEVICE_ATTR_RO(name) SHOW_FMT(index, "%d", wiphy_idx); SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; return sprintf(buf, "%s\n", wiphy_name(wiphy)); } static DEVICE_ATTR_RO(name); static ssize_t addresses_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; char *start = buf; int i; if (!wiphy->addresses) return sprintf(buf, "%pM\n", wiphy->perm_addr); for (i = 0; i < wiphy->n_addresses; i++) buf += sprintf(buf, "%pM\n", wiphy->addresses[i].addr); return buf - start; } static DEVICE_ATTR_RO(addresses); static struct attribute *ieee80211_attrs[] = { &dev_attr_index.attr, &dev_attr_macaddress.attr, &dev_attr_address_mask.attr, &dev_attr_addresses.attr, &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(ieee80211); static void wiphy_dev_release(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); cfg80211_dev_free(rdev); } #ifdef CONFIG_PM_SLEEP static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_leave(rdev, wdev); } static int wiphy_suspend(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; rdev->suspend_at = ktime_get_boottime_seconds(); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered) { if (!rdev->wiphy.wowlan_config) { cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); } cfg80211_process_wiphy_works(rdev, NULL); if (rdev->ops->suspend) ret = rdev_suspend(rdev, rdev->wiphy.wowlan_config); if (ret == 1) { /* Driver refuse to configure wowlan */ cfg80211_leave_all(rdev); cfg80211_process_rdev_events(rdev); cfg80211_process_wiphy_works(rdev, NULL); ret = rdev_suspend(rdev, NULL); } if (ret == 0) rdev->suspended = true; } wiphy_unlock(&rdev->wiphy); rtnl_unlock(); return ret; } static int wiphy_resume(struct device *dev) { struct cfg80211_registered_device *rdev = dev_to_rdev(dev); int ret = 0; /* Age scan results with time spent in suspend */ cfg80211_bss_age(rdev, ktime_get_boottime_seconds() - rdev->suspend_at); rtnl_lock(); wiphy_lock(&rdev->wiphy); if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; queue_work(system_unbound_wq, &rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) cfg80211_shutdown_all_interfaces(&rdev->wiphy); rtnl_unlock(); return ret; } static SIMPLE_DEV_PM_OPS(wiphy_pm_ops, wiphy_suspend, wiphy_resume); #define WIPHY_PM_OPS (&wiphy_pm_ops) #else #define WIPHY_PM_OPS NULL #endif static const void *wiphy_namespace(const struct device *d) { struct wiphy *wiphy = container_of(d, struct wiphy, dev); return wiphy_net(wiphy); } struct class ieee80211_class = { .name = "ieee80211", .dev_release = wiphy_dev_release, .dev_groups = ieee80211_groups, .pm = WIPHY_PM_OPS, .ns_type = &net_ns_type_operations, .namespace = wiphy_namespace, }; int wiphy_sysfs_init(void) { return class_register(&ieee80211_class); } void wiphy_sysfs_exit(void) { class_unregister(&ieee80211_class); }
81 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 /* SPDX-License-Identifier: GPL-2.0 */ /* * This is <linux/capability.h> * * Andrew G. Morgan <morgan@kernel.org> * Alexander Kjeldaas <astor@guardian.no> * with help from Aleph1, Roland Buresund and Andrew Main. * * See here for the libcap library ("POSIX draft" compliance): * * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> #include <linux/uidgid.h> #include <linux/bits.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 extern int file_caps_enabled; typedef struct { u64 val; } kernel_cap_t; /* same as vfs_ns_cap_data but in cpu endian and always filled completely */ struct cpu_vfs_cap_data { __u32 magic_etc; kuid_t rootid; kernel_cap_t permitted; kernel_cap_t inheritable; }; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) struct file; struct inode; struct dentry; struct task_struct; struct user_namespace; struct mnt_idmap; /* * CAP_FS_MASK and CAP_NFSD_MASKS: * * The fs mask is all the privileges that fsuid==0 historically meant. * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. * * It has never meant setting security.* and trusted.* xattrs. * * We could also define fsmask as follows: * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions * 2. The security.* and trusted.* xattrs are fs-related MAC permissions */ # define CAP_FS_MASK (BIT_ULL(CAP_CHOWN) \ | BIT_ULL(CAP_MKNOD) \ | BIT_ULL(CAP_DAC_OVERRIDE) \ | BIT_ULL(CAP_DAC_READ_SEARCH) \ | BIT_ULL(CAP_FOWNER) \ | BIT_ULL(CAP_FSETID) \ | BIT_ULL(CAP_MAC_OVERRIDE)) #define CAP_VALID_MASK (BIT_ULL(CAP_LAST_CAP+1)-1) # define CAP_EMPTY_SET ((kernel_cap_t) { 0 }) # define CAP_FULL_SET ((kernel_cap_t) { CAP_VALID_MASK }) # define CAP_FS_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) }) # define CAP_NFSD_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) }) # define cap_clear(c) do { (c).val = 0; } while (0) #define cap_raise(c, flag) ((c).val |= BIT_ULL(flag)) #define cap_lower(c, flag) ((c).val &= ~BIT_ULL(flag)) #define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0) static inline kernel_cap_t cap_combine(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val | b.val }; } static inline kernel_cap_t cap_intersect(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val & b.val }; } static inline kernel_cap_t cap_drop(const kernel_cap_t a, const kernel_cap_t drop) { return (kernel_cap_t) { a.val &~ drop.val }; } static inline bool cap_isclear(const kernel_cap_t a) { return !a.val; } static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b) { return a.val == b.val; } /* * Check if "a" is a subset of "set". * return true if ALL of the capabilities in "a" are also in "set" * cap_issubset(0101, 1111) will return true * return false if ANY of the capabilities in "a" are not in "set" * cap_issubset(1111, 0101) will return false */ static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { return !(a.val & ~set.val); } /* Used to decide between falling back on the old suser() or fsuser(). */ static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { return cap_drop(a, CAP_FS_SET); } static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_FS_SET)); } static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { return cap_drop(a, CAP_NFSD_SET); } static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_NFSD_SET)); } #ifdef CONFIG_MULTIUSER extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); #else static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool has_capability_noaudit(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool capable(int cap) { return true; } static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_setid(struct user_namespace *ns, int cap) { return true; } #endif /* CONFIG_MULTIUSER */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode); bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); static inline bool perfmon_capable(void) { return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } static inline bool bpf_capable(void) { return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) { return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN); } /* audit system wants to get cap info from files as well */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */
2277 2271 2275 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002 Andi Kleen, SuSE Labs. * Thanks to Ben LaHaise for precious feedback. */ #include <linux/highmem.h> #include <linux/memblock.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> #include <linux/gfp.h> #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/libnvdimm.h> #include <linux/vmstat.h> #include <linux/kernel.h> #include <linux/cc_platform.h> #include <linux/set_memory.h> #include <linux/memregion.h> #include <asm/e820/api.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/sections.h> #include <asm/setup.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/memtype.h> #include "../mm_internal.h" /* * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { unsigned long *vaddr; pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; unsigned long numpages; unsigned long curpage; unsigned long pfn; unsigned int flags; unsigned int force_split : 1, force_static_prot : 1, force_flush_all : 1; struct page **pages; }; enum cpa_warn { CPA_CONFLICT, CPA_PROTECT, CPA_DETECT, }; static const int cpa_warn_level = CPA_PROTECT; /* * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) * using cpa_lock. So that we don't allow any other cpu, with stale large tlb * entries change the page attribute in parallel to some other cpu * splitting a large page entry along with changing the attribute. */ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ #define CPA_COLLAPSE 16 /* try to collapse large pages */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { return __pgprot(cachemode2protval(pcm)); } #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; void update_page_count(int level, unsigned long pages) { /* Protect against CPA */ spin_lock(&pgd_lock); direct_pages_count[level] += pages; spin_unlock(&pgd_lock); } static void split_page_count(int level) { if (direct_pages_count[level] == 0) return; direct_pages_count[level]--; if (system_state == SYSTEM_RUNNING) { if (level == PG_LEVEL_2M) count_vm_event(DIRECT_MAP_LEVEL2_SPLIT); else if (level == PG_LEVEL_1G) count_vm_event(DIRECT_MAP_LEVEL3_SPLIT); } direct_pages_count[level - 1] += PTRS_PER_PTE; } static void collapse_page_count(int level) { direct_pages_count[level]++; if (system_state == SYSTEM_RUNNING) { if (level == PG_LEVEL_2M) count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE); else if (level == PG_LEVEL_1G) count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE); } direct_pages_count[level - 1] -= PTRS_PER_PTE; } void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", direct_pages_count[PG_LEVEL_4K] << 2); #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) seq_printf(m, "DirectMap2M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 11); #else seq_printf(m, "DirectMap4M: %8lu kB\n", direct_pages_count[PG_LEVEL_2M] << 12); #endif if (direct_gbpages) seq_printf(m, "DirectMap1G: %8lu kB\n", direct_pages_count[PG_LEVEL_1G] << 20); } #else static inline void split_page_count(int level) { } static inline void collapse_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS static unsigned long cpa_1g_checked; static unsigned long cpa_1g_sameprot; static unsigned long cpa_1g_preserved; static unsigned long cpa_2m_checked; static unsigned long cpa_2m_sameprot; static unsigned long cpa_2m_preserved; static unsigned long cpa_4k_install; static inline void cpa_inc_1g_checked(void) { cpa_1g_checked++; } static inline void cpa_inc_2m_checked(void) { cpa_2m_checked++; } static inline void cpa_inc_4k_install(void) { data_race(cpa_4k_install++); } static inline void cpa_inc_lp_sameprot(int level) { if (level == PG_LEVEL_1G) cpa_1g_sameprot++; else cpa_2m_sameprot++; } static inline void cpa_inc_lp_preserved(int level) { if (level == PG_LEVEL_1G) cpa_1g_preserved++; else cpa_2m_preserved++; } static int cpastats_show(struct seq_file *m, void *p) { seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked); seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot); seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved); seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked); seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot); seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved); seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install); return 0; } static int cpastats_open(struct inode *inode, struct file *file) { return single_open(file, cpastats_show, NULL); } static const struct file_operations cpastats_fops = { .open = cpastats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static int __init cpa_stats_init(void) { debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL, &cpastats_fops); return 0; } late_initcall(cpa_stats_init); #else static inline void cpa_inc_1g_checked(void) { } static inline void cpa_inc_2m_checked(void) { } static inline void cpa_inc_4k_install(void) { } static inline void cpa_inc_lp_sameprot(int level) { } static inline void cpa_inc_lp_preserved(int level) { } #endif static inline int within(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr < end; } #ifdef CONFIG_X86_64 static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): * * 1. The kernel direct map (0xffff880000000000) * 2. The "high kernel map" (0xffffffff81000000) * * We actually execute out of #2. If we get the address of a kernel symbol, it * points to #2, but almost all physical-to-virtual translations point to #1. * * This is so that we can have both a directmap of all physical memory *and* * take full advantage of the limited (s32) immediate addressing range (2G) * of x86_64. * * See Documentation/arch/x86/x86_64/mm.rst for more detail. */ static inline unsigned long highmap_start_pfn(void) { return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { /* Do not reference physical address outside the kernel. */ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT; } static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* * Kernel text has an alias mapping at a high address, known * here as "highmap". */ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn()); } #else static bool __cpa_pfn_in_highmap(unsigned long pfn) { /* There is no highmap on 32-bit */ return false; } #endif /* * See set_mce_nospec(). * * Machine check recovery code needs to change cache mode of poisoned pages to * UC to avoid speculative access logging another error. But passing the * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a * speculative access. So we cheat and flip the top bit of the address. This * works fine for the code that updates the page tables. But at the end of the * process we need to flush the TLB and cache and the non-canonical address * causes a #GP fault when used by the INVLPG and CLFLUSH instructions. * * But in the common case we already have a canonical address. This code * will fix the top bit if needed and is a no-op otherwise. */ static inline unsigned long fix_addr(unsigned long addr) { #ifdef CONFIG_X86_64 return (long)(addr << 1) >> 1; #else return addr; #endif } static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) { if (cpa->flags & CPA_PAGES_ARRAY) { struct page *page = cpa->pages[idx]; if (unlikely(PageHighMem(page))) return 0; return (unsigned long)page_address(page); } if (cpa->flags & CPA_ARRAY) return cpa->vaddr[idx]; return *cpa->vaddr + idx * PAGE_SIZE; } /* * Flushing functions */ static void clflush_cache_range_opt(void *vaddr, unsigned int size) { const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; if (p >= vend) return; for (; p < vend; p += clflush_size) clflushopt(p); } /** * clflush_cache_range - flush a cache range with clflush * @vaddr: virtual start address * @size: number of bytes to flush * * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or * SFENCE to avoid ordering issues. */ void clflush_cache_range(void *vaddr, unsigned int size) { mb(); clflush_cache_range_opt(vaddr, size); mb(); } EXPORT_SYMBOL_GPL(clflush_cache_range); #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_invalidate_pmem(void *addr, size_t size) { clflush_cache_range(addr, size); } EXPORT_SYMBOL_GPL(arch_invalidate_pmem); #endif #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION bool cpu_cache_has_invalidate_memregion(void) { return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); } EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); int cpu_cache_invalidate_memregion(int res_desc) { if (WARN_ON_ONCE(!cpu_cache_has_invalidate_memregion())) return -ENXIO; wbinvd_on_all_cpus(); return 0; } EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM"); #endif static void __cpa_flush_all(void *arg) { unsigned long cache = (unsigned long)arg; /* * Flush all to work around Errata in early athlons regarding * large page flushing. */ __flush_tlb_all(); if (cache && boot_cpu_data.x86 >= 4) wbinvd(); } static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static void __cpa_flush_tlb(void *data) { struct cpa_data *cpa = data; unsigned int i; for (i = 0; i < cpa->numpages; i++) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } static int collapse_large_pages(unsigned long addr, struct list_head *pgtables); static void cpa_collapse_large_pages(struct cpa_data *cpa) { unsigned long start, addr, end; struct ptdesc *ptdesc, *tmp; LIST_HEAD(pgtables); int collapsed = 0; int i; if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { for (i = 0; i < cpa->numpages; i++) collapsed += collapse_large_pages(__cpa_addr(cpa, i), &pgtables); } else { addr = __cpa_addr(cpa, 0); start = addr & PMD_MASK; end = addr + PAGE_SIZE * cpa->numpages; for (addr = start; within(addr, start, end); addr += PMD_SIZE) collapsed += collapse_large_pages(addr, &pgtables); } if (!collapsed) return; flush_tlb_all(); list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { list_del(&ptdesc->pt_list); __free_page(ptdesc_page(ptdesc)); } } static void cpa_flush(struct cpa_data *cpa, int cache) { unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); goto collapse_large_pages; } if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) flush_tlb_all(); else on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) goto collapse_large_pages; mb(); for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; pte_t *pte = lookup_address(addr, &level); /* * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); collapse_large_pages: if (cpa->flags & CPA_COLLAPSE) cpa_collapse_large_pages(cpa); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, unsigned long r2_start, unsigned long r2_end) { return (r1_start <= r2_end && r1_end >= r2_start) || (r2_start <= r1_end && r2_end >= r1_start); } #ifdef CONFIG_PCI_BIOS /* * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS * based config access (CONFIG_PCI_GOBIOS) support. */ #define BIOS_PFN PFN_DOWN(BIOS_BEGIN) #define BIOS_PFN_END PFN_DOWN(BIOS_END - 1) static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END)) return _PAGE_NX; return 0; } #else static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn) { return 0; } #endif /* * The .rodata section needs to be read-only. Using the pfn catches all * aliases. This also includes __ro_after_init, so do not enforce until * kernel_set_to_readonly is true. */ static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn) { unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata)); /* * Note: __end_rodata is at page aligned and not inclusive, so * subtract 1 to get the last enforced PFN in the rodata area. */ epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1; if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro)) return _PAGE_RW; return 0; } /* * Protect kernel text against becoming non executable by forbidding * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext) * out of which the kernel actually executes. Do not protect the low * mapping. * * This does not cover __inittext since that is gone after boot. */ static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)_etext - 1; unsigned long t_start = (unsigned long)_text; if (overlaps(start, end, t_start, t_end)) return _PAGE_NX; return 0; } #if defined(CONFIG_X86_64) /* * Once the kernel maps the text as RO (kernel_set_to_readonly is set), * kernel text mappings for the large page aligned text, rodata sections * will be always read-only. For the kernel identity mappings covering the * holes caused by this alignment can be anything that user asks. * * This will preserve the large page mappings for kernel text/data at no * extra cost. */ static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1; unsigned long t_start = (unsigned long)_text; unsigned int level; if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end)) return 0; /* * Don't enforce the !RW mapping for the kernel text mapping, if * the current mapping is already using small page mapping. No * need to work hard to preserve large page mappings in this case. * * This also fixes the Linux Xen paravirt guest boot failure caused * by unexpected read-only mappings for kernel identity * mappings. In this paravirt guest case, the kernel text mapping * and the kernel identity mapping share the same page-table pages, * so the protections for kernel text and identity mappings have to * be the same. */ if (lookup_address(start, &level) && (level != PG_LEVEL_4K)) return _PAGE_RW; return 0; } #else static pgprotval_t protect_kernel_text_ro(unsigned long start, unsigned long end) { return 0; } #endif static inline bool conflicts(pgprot_t prot, pgprotval_t val) { return (pgprot_val(prot) & ~val) != pgprot_val(prot); } static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val, unsigned long start, unsigned long end, unsigned long pfn, const char *txt) { static const char *lvltxt[] = { [CPA_CONFLICT] = "conflict", [CPA_PROTECT] = "protect", [CPA_DETECT] = "detect", }; if (warnlvl > cpa_warn_level || !conflicts(prot, val)) return; pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n", lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot), (unsigned long long)val); } /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this * right (again, ioremap() on BIOS memory is not uncommon) so this function * checks and fixes these known static required protection bits. */ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, unsigned long pfn, unsigned long npg, unsigned long lpsize, int warnlvl) { pgprotval_t forbidden, res; unsigned long end; /* * There is no point in checking RW/NX conflicts when the requested * mapping is setting the page !PRESENT. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) return prot; /* Operate on the virtual address */ end = start + npg * PAGE_SIZE - 1; res = protect_kernel_text(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX"); forbidden = res; /* * Special case to preserve a large page. If the change spawns the * full large page mapping then there is no point to split it * up. Happens with ftrace and is going to be removed once ftrace * switched to text_poke(). */ if (lpsize != (npg * PAGE_SIZE) || (start & (lpsize - 1))) { res = protect_kernel_text_ro(start, end); check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO"); forbidden |= res; } /* Check the PFN directly */ res = protect_pci_bios(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX"); forbidden |= res; res = protect_rodata(pfn, pfn + npg - 1); check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO"); forbidden |= res; return __pgprot(pgprot_val(prot) & ~forbidden); } /* * Validate strict W^X semantics. */ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, unsigned long pfn, unsigned long npg, bool nx, bool rw) { unsigned long end; /* * 32-bit has some unfixable W+X issues, like EFI code * and writeable data being in the same page. Disable * detection and enforcement there. */ if (IS_ENABLED(CONFIG_X86_32)) return new; /* Only verify when NX is supported: */ if (!(__supported_pte_mask & _PAGE_NX)) return new; if (!((pgprot_val(old) ^ pgprot_val(new)) & (_PAGE_RW | _PAGE_NX))) return new; if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) return new; /* Non-leaf translation entries can disable writing or execution. */ if (!rw || nx) return new; end = start + npg * PAGE_SIZE - 1; WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n", (unsigned long long)pgprot_val(old), (unsigned long long)pgprot_val(new), start, end, pfn); /* * For now, allow all permission change attempts by returning the * attempted permissions. This can 'return old' to actively * refuse the permission change at a later time. */ return new; } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry (or NULL if the entry does not exist), * the level of the entry, and the effective NX and RW bits of all * page table levels. */ pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, unsigned int *level, bool *nx, bool *rw) { p4d_t *p4d; pud_t *pud; pmd_t *pmd; *level = PG_LEVEL_256T; *nx = false; *rw = true; if (pgd_none(*pgd)) return NULL; *level = PG_LEVEL_512G; *nx |= pgd_flags(*pgd) & _PAGE_NX; *rw &= pgd_flags(*pgd) & _PAGE_RW; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return NULL; if (p4d_leaf(*p4d) || !p4d_present(*p4d)) return (pte_t *)p4d; *level = PG_LEVEL_1G; *nx |= p4d_flags(*p4d) & _PAGE_NX; *rw &= p4d_flags(*p4d) & _PAGE_RW; pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; if (pud_leaf(*pud) || !pud_present(*pud)) return (pte_t *)pud; *level = PG_LEVEL_2M; *nx |= pud_flags(*pud) & _PAGE_NX; *rw &= pud_flags(*pud) & _PAGE_RW; pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; *nx |= pmd_flags(*pmd) & _PAGE_NX; *rw &= pmd_flags(*pmd) & _PAGE_RW; return pte_offset_kernel(pmd, address); } /* * Lookup the page table entry for a virtual address in a specific pgd. * Return a pointer to the entry and the level of the mapping. */ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { bool nx, rw; return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw); } /* * Lookup the page table entry for a virtual address. Return a pointer * to the entry and the level of the mapping. * * Note: the function returns p4d, pud or pmd either when the entry is marked * large or when the present bit is not set. Otherwise it returns NULL. */ pte_t *lookup_address(unsigned long address, unsigned int *level) { return lookup_address_in_pgd(pgd_offset_k(address), address, level); } EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, unsigned int *level, bool *nx, bool *rw) { pgd_t *pgd; if (!cpa->pgd) pgd = pgd_offset_k(address); else pgd = cpa->pgd + pgd_index(address); return lookup_address_in_pgd_attr(pgd, address, level, nx, rw); } /* * Lookup the PMD entry for a virtual address. Return a pointer to the entry * or NULL if not present. */ pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; p4d = p4d_offset(pgd, address); if (p4d_none(*p4d) || p4d_leaf(*p4d) || !p4d_present(*p4d)) return NULL; pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud)) return NULL; return pmd_offset(pud, address); } /* * This is necessary because __pa() does not work on some * kinds of memory, like vmalloc() or the alloc_remap() * areas on 32-bit NUMA systems. The percpu areas can * end up in this kind of memory, for instance. * * Note that as long as the PTEs are well-formed with correct PFNs, this * works without checking the PRESENT bit in the leaf PTE. This is unlike * the similar vmalloc_to_page() and derivatives. Callers may depend on * this behavior. * * This could be optimized, but it is only used in paths that are not perf * sensitive, and keeping it unoptimized should increase the testing coverage * for the more obscure platforms. */ phys_addr_t slow_virt_to_phys(void *__virt_addr) { unsigned long virt_addr = (unsigned long)__virt_addr; phys_addr_t phys_addr; unsigned long offset; enum pg_level level; pte_t *pte; pte = lookup_address(virt_addr, &level); BUG_ON(!pte); /* * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t * before being left-shifted PAGE_SHIFT bits -- this trick is to * make 32-PAE kernel work correctly. */ switch (level) { case PG_LEVEL_1G: phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PUD_MASK; break; case PG_LEVEL_2M: phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT; offset = virt_addr & ~PMD_MASK; break; default: phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; offset = virt_addr & ~PAGE_MASK; } return (phys_addr_t)(phys_addr | offset); } EXPORT_SYMBOL_GPL(slow_virt_to_phys); /* * Set the new pmd in all the pgds we know about: */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 if (!SHARED_KERNEL_PMD) { struct page *page; list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); p4d = p4d_offset(pgd, address); pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } } #endif } static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot) { /* * _PAGE_GLOBAL means "global page" for present PTEs. * But, it is also used to indicate _PAGE_PROTNONE * for non-present PTEs. * * This ensures that a _PAGE_GLOBAL PTE going from * present to non-present is not confused as * _PAGE_PROTNONE. */ if (!(pgprot_val(prot) & _PAGE_PRESENT)) pgprot_val(prot) &= ~_PAGE_GLOBAL; return prot; } static int __should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, *tmp; enum pg_level level; bool nx, rw; /* * Check for races, another CPU might have split this page * up already: */ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) return 1; switch (level) { case PG_LEVEL_2M: old_prot = pmd_pgprot(*(pmd_t *)kpte); old_pfn = pmd_pfn(*(pmd_t *)kpte); cpa_inc_2m_checked(); break; case PG_LEVEL_1G: old_prot = pud_pgprot(*(pud_t *)kpte); old_pfn = pud_pfn(*(pud_t *)kpte); cpa_inc_1g_checked(); break; default: return -EINVAL; } psize = page_level_size(level); pmask = page_level_mask(level); /* * Calculate the number of pages, which fit into this large * page starting at address: */ lpaddr = (address + psize) & pmask; numpages = (lpaddr - address) >> PAGE_SHIFT; if (numpages < cpa->numpages) cpa->numpages = numpages; /* * We are safe now. Check whether the new pgprot is the same: * Convert protection attributes to 4k-format, as cpa->mask* are set * up accordingly. */ /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot); pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); /* * req_prot is in format of 4k pages. It must be converted to large * page format: the caching mode includes the PAT bit located at * different bit positions in the two formats. */ req_prot = pgprot_4k_2_large(req_prot); req_prot = pgprot_clear_protnone_bits(req_prot); if (pgprot_val(req_prot) & _PAGE_PRESENT) pgprot_val(req_prot) |= _PAGE_PSE; /* * old_pfn points to the large page base pfn. So we need to add the * offset of the virtual address: */ pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); cpa->pfn = pfn; /* * Calculate the large page base address and the number of 4K pages * in the large page */ lpaddr = address & pmask; numpages = psize >> PAGE_SHIFT; /* * Sanity check that the existing mapping is correct versus the static * protections. static_protections() guards against !PRESENT, so no * extra conditional required here. */ chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages, psize, CPA_CONFLICT); if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) { /* * Split the large page and tell the split code to * enforce static protections. */ cpa->force_static_prot = 1; return 1; } /* * Optimization: If the requested pgprot is the same as the current * pgprot, then the large page can be preserved and no updates are * required independent of alignment and length of the requested * range. The above already established that the current pgprot is * correct, which in consequence makes the requested pgprot correct * as well if it is the same. The static protection scan below will * not come to a different conclusion. */ if (pgprot_val(req_prot) == pgprot_val(old_prot)) { cpa_inc_lp_sameprot(level); return 0; } /* * If the requested range does not cover the full page, split it up */ if (address != lpaddr || cpa->numpages != numpages) return 1; /* * Check whether the requested pgprot is conflicting with a static * protection requirement in the large page. */ new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, psize, CPA_DETECT); new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages, nx, rw); /* * If there is a conflict, split the large page. * * There used to be a 4k wise evaluation trying really hard to * preserve the large pages, but experimentation has shown, that this * does not help at all. There might be corner cases which would * preserve one large page occasionally, but it's really not worth the * extra code and cycles for the common case. */ if (pgprot_val(req_prot) != pgprot_val(new_prot)) return 1; /* All checks passed. Update the large page mapping. */ new_pte = pfn_pte(old_pfn, new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; cpa_inc_lp_preserved(level); return 0; } static int should_split_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { int do_split; if (cpa->force_split) return 1; spin_lock(&pgd_lock); do_split = __should_split_large_page(kpte, address, cpa); spin_unlock(&pgd_lock); return do_split; } static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn, pgprot_t ref_prot, unsigned long address, unsigned long size) { unsigned int npg = PFN_DOWN(size); pgprot_t prot; /* * If should_split_large_page() discovered an inconsistent mapping, * remove the invalid protection in the split mapping. */ if (!cpa->force_static_prot) goto set; /* Hand in lpsize = 0 to enforce the protection mechanism */ prot = static_protections(ref_prot, address, pfn, npg, 0, CPA_PROTECT); if (pgprot_val(prot) == pgprot_val(ref_prot)) goto set; /* * If this is splitting a PMD, fix it up. PUD splits cannot be * fixed trivially as that would require to rescan the newly * installed PMD mappings after returning from split_large_page() * so an eventual further split can allocate the necessary PTE * pages. Warn for now and revisit it in case this actually * happens. */ if (size == PAGE_SIZE) ref_prot = prot; else pr_warn_once("CPA: Cannot fixup static protections for PUD split\n"); set: set_pte(pte, pfn_pte(pfn, ref_prot)); } static int __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, struct page *base) { unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1; pte_t *pbase = (pte_t *)page_address(base); unsigned int i, level; pgprot_t ref_prot; bool nx, rw; pte_t *tmp; spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: */ tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; } paravirt_alloc_pte(&init_mm, page_to_pfn(base)); switch (level) { case PG_LEVEL_2M: ref_prot = pmd_pgprot(*(pmd_t *)kpte); /* * Clear PSE (aka _PAGE_PAT) and move * PAT bit to correct position. */ ref_prot = pgprot_large_2_4k(ref_prot); ref_pfn = pmd_pfn(*(pmd_t *)kpte); lpaddr = address & PMD_MASK; lpinc = PAGE_SIZE; break; case PG_LEVEL_1G: ref_prot = pud_pgprot(*(pud_t *)kpte); ref_pfn = pud_pfn(*(pud_t *)kpte); pfninc = PMD_SIZE >> PAGE_SHIFT; lpaddr = address & PUD_MASK; lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set * otherwise pmd_present() will return true even on a non * present pmd. */ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; break; default: spin_unlock(&pgd_lock); return 1; } ref_prot = pgprot_clear_protnone_bits(ref_prot); /* * Get the target pfn from the original entry: */ pfn = ref_pfn; for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc) split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc); if (virt_addr_valid(address)) { unsigned long pfn = PFN_DOWN(__pa(address)); if (pfn_range_is_mapped(pfn, pfn + 1)) split_page_count(level); } /* * Install the new, split up pagetable. * * We use the standard kernel pagetable protections for the new * pagetable protections, the actual ptes set above control the * primary protection behavior: */ __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* * Do a global flush tlb after splitting the large page * and before we do the actual change page attribute in the PTE. * * Without this, we violate the TLB application note, that says: * "The TLBs may contain both ordinary and large-page * translations for a 4-KByte range of linear addresses. This * may occur if software modifies the paging structures so that * the page size used for the address range changes. If the two * translations differ with respect to page frame or attributes * (e.g., permissions), processor behavior is undefined and may * be implementation-specific." * * We do this global tlb flush inside the cpa_lock, so that we * don't allow any other cpu, with stale tlb entries change the * page attribute in parallel, that also falls into the * just split large page entry. */ flush_tlb_all(); spin_unlock(&pgd_lock); return 0; } static int split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address) { struct page *base; if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) return -ENOMEM; if (__split_large_page(cpa, kpte, address, base)) __free_page(base); return 0; } static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, struct list_head *pgtables) { pmd_t _pmd, old_pmd; pte_t *pte, first; unsigned long pfn; pgprot_t pgprot; int i = 0; addr &= PMD_MASK; pte = pte_offset_kernel(pmd, addr); first = *pte; pfn = pte_pfn(first); /* Make sure alignment is suitable */ if (PFN_PHYS(pfn) & ~PMD_MASK) return 0; /* The page is 4k intentionally */ if (pte_flags(first) & _PAGE_KERNEL_4K) return 0; /* Check that the rest of PTEs are compatible with the first one */ for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) { pte_t entry = *pte; if (!pte_present(entry)) return 0; if (pte_flags(entry) != pte_flags(first)) return 0; if (pte_pfn(entry) != pte_pfn(first) + i) return 0; } old_pmd = *pmd; /* Success: set up a large page */ pgprot = pgprot_4k_2_large(pte_pgprot(first)); pgprot_val(pgprot) |= _PAGE_PSE; _pmd = pfn_pmd(pfn, pgprot); set_pmd(pmd, _pmd); /* Queue the page table to be freed after TLB flush */ list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) { struct page *page; /* Update all PGD tables to use the same large page */ list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); /* Something is wrong if entries doesn't match */ if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd))) continue; set_pmd(pmd, _pmd); } } if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) collapse_page_count(PG_LEVEL_2M); return 1; } static int collapse_pud_page(pud_t *pud, unsigned long addr, struct list_head *pgtables) { unsigned long pfn; pmd_t *pmd, first; int i; if (!direct_gbpages) return 0; addr &= PUD_MASK; pmd = pmd_offset(pud, addr); first = *pmd; /* * To restore PUD page all PMD entries must be large and * have suitable alignment */ pfn = pmd_pfn(first); if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK)) return 0; /* * To restore PUD page, all following PMDs must be compatible with the * first one. */ for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) { pmd_t entry = *pmd; if (!pmd_present(entry) || !pmd_leaf(entry)) return 0; if (pmd_flags(entry) != pmd_flags(first)) return 0; if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE) return 0; } /* Restore PUD page and queue page table to be freed after TLB flush */ list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables); set_pud(pud, pfn_pud(pfn, pmd_pgprot(first))); if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) collapse_page_count(PG_LEVEL_1G); return 1; } /* * Collapse PMD and PUD pages in the kernel mapping around the address where * possible. * * Caller must flush TLB and free page tables queued on the list before * touching the new entries. CPU must not see TLB entries of different size * with different attributes. */ static int collapse_large_pages(unsigned long addr, struct list_head *pgtables) { int collapsed = 0; pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; addr &= PMD_MASK; spin_lock(&pgd_lock); pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) goto out; p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) goto out; pud = pud_offset(p4d, addr); if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; collapsed = collapse_pmd_page(pmd, addr, pgtables); if (collapsed) collapsed += collapse_pud_page(pud, addr, pgtables); out: spin_unlock(&pgd_lock); return collapsed; } static bool try_to_free_pte_page(pte_t *pte) { int i; for (i = 0; i < PTRS_PER_PTE; i++) if (!pte_none(pte[i])) return false; free_page((unsigned long)pte); return true; } static bool try_to_free_pmd_page(pmd_t *pmd) { int i; for (i = 0; i < PTRS_PER_PMD; i++) if (!pmd_none(pmd[i])) return false; free_page((unsigned long)pmd); return true; } static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); while (start < end) { set_pte(pte, __pte(0)); start += PAGE_SIZE; pte++; } if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { pmd_clear(pmd); return true; } return false; } static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { if (unmap_pte_range(pmd, start, end)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) { pmd_t *pmd = pmd_offset(pud, start); /* * Not on a 2MB page boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); __unmap_pmd_range(pud, pmd, start, pre_end); start = pre_end; pmd++; } /* * Try to unmap in 2M chunks. */ while (end - start >= PMD_SIZE) { if (pmd_leaf(*pmd)) pmd_clear(pmd); else __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); start += PMD_SIZE; pmd++; } /* * 4K leftovers? */ if (start < end) return __unmap_pmd_range(pud, pmd, start, end); /* * Try again to free the PMD page if haven't succeeded above. */ if (!pud_none(*pud)) if (try_to_free_pmd_page(pud_pgtable(*pud))) pud_clear(pud); } static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? */ if (start & (PUD_SIZE - 1)) { unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); unmap_pmd_range(pud, start, pre_end); start = pre_end; pud++; } /* * Try to unmap in 1G chunks? */ while (end - start >= PUD_SIZE) { if (pud_leaf(*pud)) pud_clear(pud); else unmap_pmd_range(pud, start, start + PUD_SIZE); start += PUD_SIZE; pud++; } /* * 2M leftovers? */ if (start < end) unmap_pmd_range(pud, start, end); /* * No need to try to free the PUD page because we'll free it in * populate_pgd's error path */ } static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); return 0; } static int alloc_pmd_page(pud_t *pud) { pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); return 0; } static void populate_pte(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) { pte_t *pte; pte = pte_offset_kernel(pmd, start); pgprot = pgprot_clear_protnone_bits(pgprot); while (num_pages-- && start < end) { set_pte(pte, pfn_pte(cpa->pfn, pgprot)); start += PAGE_SIZE; cpa->pfn++; pte++; } } static long populate_pmd(struct cpa_data *cpa, unsigned long start, unsigned long end, unsigned num_pages, pud_t *pud, pgprot_t pgprot) { long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; /* * Not on a 2M boundary? */ if (start & (PMD_SIZE - 1)) { unsigned long pre_end = start + (num_pages << PAGE_SHIFT); unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; pre_end = min_t(unsigned long, pre_end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(unsigned int, num_pages, cur_pages); /* * Need a PTE page? */ pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); start = pre_end; } /* * We mapped them all? */ if (num_pages == cur_pages) return cur_pages; pmd_pgprot = pgprot_4k_2_large(pgprot); while (end - start >= PMD_SIZE) { /* * We cannot use a 1G page so allocate a PMD page if needed. */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; pmd = pmd_offset(pud, start); set_pmd(pmd, pmd_mkhuge(pfn_pmd(cpa->pfn, canon_pgprot(pmd_pgprot)))); start += PMD_SIZE; cpa->pfn += PMD_SIZE >> PAGE_SHIFT; cur_pages += PMD_SIZE >> PAGE_SHIFT; } /* * Map trailing 4K pages. */ if (start < end) { pmd = pmd_offset(pud, start); if (pmd_none(*pmd)) if (alloc_pte_page(pmd)) return -1; populate_pte(cpa, start, end, num_pages - cur_pages, pmd, pgprot); } return num_pages; } static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, pgprot_t pgprot) { pud_t *pud; unsigned long end; long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); /* * Not on a Gb page boundary? => map everything up to it with * smaller pages. */ if (start & (PUD_SIZE - 1)) { unsigned long pre_end; unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; pre_end = min_t(unsigned long, end, next_page); cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); pud = pud_offset(p4d, start); /* * Need a PMD page? */ if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, pud, pgprot); if (cur_pages < 0) return cur_pages; start = pre_end; } /* We mapped them all? */ if (cpa->numpages == cur_pages) return cur_pages; pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* * Map everything starting from the Gb boundary, possibly with 1G pages */ while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) { set_pud(pud, pud_mkhuge(pfn_pud(cpa->pfn, canon_pgprot(pud_pgprot)))); start += PUD_SIZE; cpa->pfn += PUD_SIZE >> PAGE_SHIFT; cur_pages += PUD_SIZE >> PAGE_SHIFT; pud++; } /* Map trailing leftover */ if (start < end) { long tmp; pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, pud, pgprot); if (tmp < 0) return cur_pages; cur_pages += tmp; } return cur_pages; } /* * Restrictions for kernel page table do not necessarily apply when mapping in * an alternate PGD. */ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); } /* * Allocate a PUD page and hand it down for mapping. */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } cpa->numpages = ret; return 0; } static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, int primary) { if (cpa->pgd) { /* * Right now, we only execute this code path when mapping * the EFI virtual memory map regions, no other users * provide a ->pgd value. This may change in the future. */ return populate_pgd(cpa, vaddr); } /* * Ignore all non primary paths. */ if (!primary) { cpa->numpages = 1; return 0; } /* * Ignore the NULL PTE for kernel identity mapping, as it is expected * to have holes. * Also set numpages to '1' indicating that we processed cpa req for * one virtual address page and its pfn. TBD: numpages can be set based * on the initial value and the level returned by lookup_address(). */ if (within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { cpa->numpages = 1; cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; return 0; } else if (__cpa_pfn_in_highmap(cpa->pfn)) { /* Faults in the highmap are OK, so do not warn: */ return -EFAULT; } else { WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", vaddr, *cpa->vaddr); return -EFAULT; } } static int __change_page_attr(struct cpa_data *cpa, int primary) { unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; bool nx, rw; address = __cpa_addr(cpa, cpa->curpage); repeat: kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (!kpte) return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; if (pte_none(old_pte)) return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { pte_t new_pte; pgprot_t old_prot = pte_pgprot(old_pte); pgprot_t new_prot = pte_pgprot(old_pte); unsigned long pfn = pte_pfn(old_pte); pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); cpa_inc_4k_install(); /* Hand in lpsize = 0 to enforce the protection mechanism */ new_prot = static_protections(new_prot, address, pfn, 1, 0, CPA_PROTECT); new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1, nx, rw); new_prot = pgprot_clear_protnone_bits(new_prot); /* * We need to keep the pfn from the existing PTE, * after all we're only going to change its attributes * not the memory it points to */ new_pte = pfn_pte(pfn, new_prot); cpa->pfn = pfn; /* * Do we really change anything ? */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; } /* * Check, whether we can keep the large page intact * and just change the pte: */ do_split = should_split_large_page(kpte, address, cpa); /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ if (do_split <= 0) return do_split; /* * We have to split the large page: */ err = split_large_page(cpa, kpte, address); if (!err) goto repeat; return err; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary); /* * Check the directmap and "high kernel map" 'aliases'. */ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); unsigned long vaddr; int ret; if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) return 0; /* * No need to redo, when the primary call touched the direct * mapping already: */ vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { alias_cpa = *cpa; alias_cpa.vaddr = &laddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* Directmap always has NX set, do not modify. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; ret = __change_page_attr_set_clr(&alias_cpa, 0); if (ret) return ret; } #ifdef CONFIG_X86_64 /* * If the primary call didn't touch the high mapping already * and the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ if (!within(vaddr, (unsigned long)_text, _brk_end) && __cpa_pfn_in_highmap(cpa->pfn)) { unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; alias_cpa = *cpa; alias_cpa.vaddr = &temp_cpa_vaddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); alias_cpa.curpage = 0; /* * [_text, _brk_end) also covers data, do not modify NX except * in cases where the highmap is the primary target. */ if (__supported_pte_mask & _PAGE_NX) { alias_cpa.mask_clr.pgprot &= ~_PAGE_NX; alias_cpa.mask_set.pgprot &= ~_PAGE_NX; } cpa->force_flush_all = 1; /* * The high mapping range is imprecise, so ignore the * return value. */ __change_page_attr_set_clr(&alias_cpa, 0); } #endif return 0; } static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary) { unsigned long numpages = cpa->numpages; unsigned long rempages = numpages; int ret = 0; /* * No changes, easy! */ if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) && !cpa->force_split) return ret; while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. */ cpa->numpages = rempages; /* for array changes, we can't use large page */ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); ret = __change_page_attr(cpa, primary); if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); if (ret) goto out; if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) { ret = cpa_process_alias(cpa); if (ret) goto out; } /* * Adjust the number of pages with the result of the * CPA operation. Either a large page has been * preserved or a single page update happened. */ BUG_ON(cpa->numpages > rempages || !cpa->numpages); rempages -= cpa->numpages; cpa->curpage += cpa->numpages; } out: /* Restore the original numpages */ cpa->numpages = numpages; return ret; } static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag, struct page **pages) { struct cpa_data cpa; int ret, cache; memset(&cpa, 0, sizeof(cpa)); /* * Check, if we are requested to set a not supported * feature. Clearing non-supported features is OK. */ mask_set = canon_pgprot(mask_set); if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ if (in_flag & CPA_ARRAY) { int i; for (i = 0; i < numpages; i++) { if (addr[i] & ~PAGE_MASK) { addr[i] &= PAGE_MASK; WARN_ON_ONCE(1); } } } else if (!(in_flag & CPA_PAGES_ARRAY)) { /* * in_flag of CPA_PAGES_ARRAY implies it is aligned. * No need to check in that case */ if (*addr & ~PAGE_MASK) { *addr &= PAGE_MASK; /* * People should not be passing in unaligned addresses: */ WARN_ON_ONCE(1); } } /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); cpa.vaddr = addr; cpa.pages = pages; cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flags = in_flag; cpa.curpage = 0; cpa.force_split = force_split; ret = __change_page_attr_set_clr(&cpa, 1); /* * Check whether we really changed something: */ if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* * No need to flush, when we did not set any of the caching * attributes: */ cache = !!pgprot2cachemode(mask_set); /* * On error; flush everything to be sure. */ if (ret) { cpa_flush_all(cache); goto out; } cpa_flush(&cpa, cache); out: return ret; } static inline int change_page_attr_set(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, (array ? CPA_ARRAY : 0), NULL); } static inline int change_page_attr_clear(unsigned long *addr, int numpages, pgprot_t mask, int array) { return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, (array ? CPA_ARRAY : 0), NULL); } static inline int cpa_set_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, CPA_PAGES_ARRAY, pages); } static inline int cpa_clear_pages_array(struct page **pages, int numpages, pgprot_t mask) { return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, CPA_PAGES_ARRAY, pages); } int _set_memory_uc(unsigned long addr, int numpages) { /* * for now UC MINUS. see comments in ioremap() * If you really need strong UC use ioremap_uc(), but note * that you cannot override IO areas with set_memory_*() as * these helpers cannot work with IO memory. */ return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) { int ret; /* * for now UC MINUS. see comments in ioremap() */ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_UC_MINUS, NULL); if (ret) goto out_err; ret = _set_memory_uc(addr, numpages); if (ret) goto out_free; return 0; out_free: memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); out_err: return ret; } EXPORT_SYMBOL(set_memory_uc); int _set_memory_wc(unsigned long addr, int numpages) { int ret; ret = change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); if (!ret) { ret = change_page_attr_set_clr(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } return ret; } int set_memory_wc(unsigned long addr, int numpages) { int ret; ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_MODE_WC, NULL); if (ret) return ret; ret = _set_memory_wc(addr, numpages); if (ret) memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return ret; } EXPORT_SYMBOL(set_memory_wc); int _set_memory_wt(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0); } int _set_memory_wb(unsigned long addr, int numpages) { /* WB cache mode is hard wired to all cache attribute bits being 0 */ return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) { int ret; ret = _set_memory_wb(addr, numpages); if (ret) return ret; memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return 0; } EXPORT_SYMBOL(set_memory_wb); /* Prevent speculative access to a page by marking it not-present */ #ifdef CONFIG_X86_64 int set_mce_nospec(unsigned long pfn) { unsigned long decoy_addr; int rc; /* SGX pages are not in the 1:1 map */ if (arch_is_platform_page(pfn << PAGE_SHIFT)) return 0; /* * We would like to just call: * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); * but doing that would radically increase the odds of a * speculative access to the poison page because we'd have * the virtual address of the kernel 1:1 mapping sitting * around in registers. * Instead we get tricky. We create a non-canonical address * that looks just like the one we want, but has bit 63 flipped. * This relies on set_memory_XX() properly sanitizing any __pa() * results with __PHYSICAL_MASK or PTE_PFN_MASK. */ decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); rc = set_memory_np(decoy_addr, 1); if (rc) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } EXPORT_SYMBOL_GPL(set_mce_nospec); /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) { unsigned long addr = (unsigned long) pfn_to_kaddr(pfn); return set_memory_p(addr, 1); } EXPORT_SYMBOL_GPL(clear_mce_nospec); #endif /* CONFIG_X86_64 */ int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_nx(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) return 0; return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } int set_memory_ro(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0); } int set_memory_rox(unsigned long addr, int numpages) { pgprot_t clr = __pgprot(_PAGE_RW | _PAGE_DIRTY); if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0, CPA_COLLAPSE, NULL); } int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_np(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_np_noalias(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(0), __pgprot(_PAGE_PRESENT), 0, CPA_NO_CHECK_ALIAS, NULL); } int set_memory_p(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { return change_page_attr_set_clr(&addr, numpages, __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL); } int set_memory_nonglobal(unsigned long addr, int numpages) { return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } int set_memory_global(unsigned long addr, int numpages) { return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_GLOBAL), 0); } /* * __set_memory_enc_pgtable() is used for the hypervisors that get * informed about "encryption" status via page tables. */ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) { pgprot_t empty = __pgprot(0); struct cpa_data cpa; int ret; /* Should not be working on unaligned addresses */ if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; memset(&cpa, 0, sizeof(cpa)); cpa.vaddr = &addr; cpa.numpages = numpages; cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty); cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty); cpa.pgd = init_mm.pgd; /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); vm_unmap_aliases(); /* Flush the caches as needed before changing the encryption attribute. */ if (x86_platform.guest.enc_tlb_flush_required(enc)) cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); /* Notify hypervisor that we are about to set/clr encryption attribute. */ ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc); if (ret) goto vmm_fail; ret = __change_page_attr_set_clr(&cpa, 1); /* * After changing the encryption attribute, we need to flush TLBs again * in case any speculative TLB caching occurred (but no need to flush * caches again). We could just use cpa_flush_all(), but in case TLB * flushing gets optimized in the cpa_flush() path use the same logic * as above. */ cpa_flush(&cpa, 0); if (ret) return ret; /* Notify hypervisor that we have successfully set/clr encryption attribute. */ ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc); if (ret) goto vmm_fail; return 0; vmm_fail: WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n", (void *)addr, numpages, enc ? "private" : "shared", ret); return ret; } /* * The lock serializes conversions between private and shared memory. * * It is taken for read on conversion. A write lock guarantees that no * concurrent conversions are in progress. */ static DECLARE_RWSEM(mem_enc_lock); /* * Stop new private<->shared conversions. * * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete. * The lock is not released to prevent new conversions from being started. */ bool set_memory_enc_stop_conversion(void) { /* * In a crash scenario, sleep is not allowed. Try to take the lock. * Failure indicates that there is a race with the conversion. */ if (oops_in_progress) return down_write_trylock(&mem_enc_lock); down_write(&mem_enc_lock); return true; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { int ret = 0; if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { if (!down_read_trylock(&mem_enc_lock)) return -EBUSY; ret = __set_memory_enc_pgtable(addr, numpages, enc); up_read(&mem_enc_lock); } return ret; } int set_memory_encrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, true); } EXPORT_SYMBOL_GPL(set_memory_encrypted); int set_memory_decrypted(unsigned long addr, int numpages) { return __set_memory_enc_dec(addr, numpages, false); } EXPORT_SYMBOL_GPL(set_memory_decrypted); int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_pages_uc); static int _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type) { unsigned long start; unsigned long end; enum page_cache_mode set_type; int i; int free_idx; int ret; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; if (memtype_reserve(start, end, new_type, NULL)) goto err_out; } /* If WC, set to UC- first and then WC */ set_type = (new_type == _PAGE_CACHE_MODE_WC) ? _PAGE_CACHE_MODE_UC_MINUS : new_type; ret = cpa_set_pages_array(pages, numpages, cachemode2pgprot(set_type)); if (!ret && new_type == _PAGE_CACHE_MODE_WC) ret = change_page_attr_set_clr(NULL, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, CPA_PAGES_ARRAY, pages); if (ret) goto err_out; return 0; /* Success */ err_out: free_idx = i; for (i = 0; i < free_idx; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return -EINVAL; } int set_pages_array_uc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_pages_array_uc); int set_pages_array_wc(struct page **pages, int numpages) { return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_pages_array_wc); int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_pages_wb); int set_pages_array_wb(struct page **pages, int numpages) { int retval; unsigned long start; unsigned long end; int i; /* WB cache mode is hard wired to all cache attribute bits being 0 */ retval = cpa_clear_pages_array(pages, numpages, __pgprot(_PAGE_CACHE_MASK)); if (retval) return retval; for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; memtype_free(start, end); } return 0; } EXPORT_SYMBOL(set_pages_array_wb); int set_pages_ro(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_ro(addr, numpages); } int set_pages_rw(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); return set_memory_rw(addr, numpages); } static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); struct cpa_data cpa = { .vaddr = &tempaddr, .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS }; /* * No alias checking needed for setting not present flag. otherwise, * we may need to break large pages for 64-bit kernel text * mappings (this adds to complexity if we want to do this from * atomic context especially). Let's keep it simple! */ return __change_page_attr_set_clr(&cpa, 1); } int set_direct_map_invalid_noflush(struct page *page) { return __set_pages_np(page, 1); } int set_direct_map_default_noflush(struct page *page) { return __set_pages_p(page, 1); } int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) { if (valid) return __set_pages_p(page, nr); return __set_pages_np(page, nr); } #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) return; if (!enable) { debug_check_no_locks_freed(page_address(page), numpages * PAGE_SIZE); } /* * The return value is ignored as the calls cannot fail. * Large pages for identity mappings are not used at boot time * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); else __set_pages_np(page, numpages); /* * We should perform an IPI and flush all tlbs, * but that can deadlock->flush only current cpu. * Preemption needs to be disabled around __flush_tlb_all() due to * CR3 reload in __native_flush_tlb(). */ preempt_disable(); __flush_tlb_all(); preempt_enable(); arch_flush_lazy_mmu_mode(); } #endif /* CONFIG_DEBUG_PAGEALLOC */ bool kernel_page_present(struct page *page) { unsigned int level; pte_t *pte; if (PageHighMem(page)) return false; pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) { int retval = -EINVAL; struct cpa_data cpa = { .vaddr = &address, .pfn = pfn, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); if (!(__supported_pte_mask & _PAGE_NX)) goto out; if (!(page_flags & _PAGE_ENC)) cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); out: return retval; } /* * __flush_tlb_all() flushes mappings only on current CPU and hence this * function shouldn't be used in an SMP environment. Presently, it's used only * during boot (way before smp_init()) by EFI subsystem and hence is ok. */ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, unsigned long numpages) { int retval; /* * The typical sequence for unmapping is to find a pte through * lookup_address_in_pgd() (ideally, it should never return NULL because * the address is already mapped) and change its protections. As pfn is * the *target* of a mapping, it's not useful while unmapping. */ struct cpa_data cpa = { .vaddr = &address, .pfn = 0, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS, }; WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP"); retval = __change_page_attr_set_clr(&cpa, 1); __flush_tlb_all(); return retval; } /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. */ #ifdef CONFIG_CPA_DEBUG #include "cpa-test.c" #endif
110 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 // SPDX-License-Identifier: GPL-2.0-only /* Accounting handling for netfilter. */ /* * (C) 2008 Krzysztof Piotr Oledzki <ole@ans.pl> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_acct.h> static bool nf_ct_acct __read_mostly; module_param_named(acct, nf_ct_acct, bool, 0644); MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); void nf_conntrack_acct_pernet_init(struct net *net) { net->ct.sysctl_acct = nf_ct_acct; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi.h - ACPI Interface * * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H #include <linux/errno.h> #include <linux/ioport.h> /* for struct resource */ #include <linux/resource_ext.h> #include <linux/device.h> #include <linux/mod_devicetable.h> #include <linux/property.h> #include <linux/uuid.h> #include <linux/node.h> struct irq_domain; struct irq_domain_ops; #ifndef _LINUX #define _LINUX #endif #include <acpi/acpi.h> #include <acpi/acpi_numa.h> #ifdef CONFIG_ACPI #include <linux/list.h> #include <linux/dynamic_debug.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/fw_table.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> #include <acpi/acpi_io.h> #include <asm/acpi.h> #ifdef CONFIG_ACPI_TABLE_LIB #define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, "ACPI") #define __init_or_acpilib #define __initdata_or_acpilib #else #define EXPORT_SYMBOL_ACPI_LIB(x) #define __init_or_acpilib __init #define __initdata_or_acpilib __initdata #endif static inline acpi_handle acpi_device_handle(struct acpi_device *adev) { return adev ? adev->handle : NULL; } #define ACPI_COMPANION(dev) to_acpi_device_node((dev)->fwnode) #define ACPI_COMPANION_SET(dev, adev) set_primary_fwnode(dev, (adev) ? \ acpi_fwnode_handle(adev) : NULL) #define ACPI_HANDLE(dev) acpi_device_handle(ACPI_COMPANION(dev)) #define ACPI_HANDLE_FWNODE(fwnode) \ acpi_device_handle(to_acpi_device_node(fwnode)) static inline struct fwnode_handle *acpi_alloc_fwnode_static(void) { struct fwnode_handle *fwnode; fwnode = kzalloc(sizeof(struct fwnode_handle), GFP_KERNEL); if (!fwnode) return NULL; fwnode_init(fwnode, &acpi_static_fwnode_ops); return fwnode; } static inline void acpi_free_fwnode_static(struct fwnode_handle *fwnode) { if (WARN_ON(!is_acpi_static_node(fwnode))) return; kfree(fwnode); } static inline bool has_acpi_companion(struct device *dev) { return is_acpi_device_node(dev->fwnode); } static inline void acpi_preset_companion(struct device *dev, struct acpi_device *parent, u64 addr) { ACPI_COMPANION_SET(dev, acpi_find_child_device(parent, addr, false)); } static inline const char *acpi_dev_name(struct acpi_device *adev) { return dev_name(&adev->dev); } struct device *acpi_get_first_physical_node(struct acpi_device *adev); enum acpi_irq_model_id { ACPI_IRQ_MODEL_PIC = 0, ACPI_IRQ_MODEL_IOAPIC, ACPI_IRQ_MODEL_IOSAPIC, ACPI_IRQ_MODEL_PLATFORM, ACPI_IRQ_MODEL_GIC, ACPI_IRQ_MODEL_LPIC, ACPI_IRQ_MODEL_RINTC, ACPI_IRQ_MODEL_COUNT }; extern enum acpi_irq_model_id acpi_irq_model; enum acpi_interrupt_id { ACPI_INTERRUPT_PMI = 1, ACPI_INTERRUPT_INIT, ACPI_INTERRUPT_CPEI, ACPI_INTERRUPT_COUNT }; #define ACPI_SPACE_MEM 0 enum acpi_address_range_id { ACPI_ADDRESS_RANGE_MEMORY = 1, ACPI_ADDRESS_RANGE_RESERVED = 2, ACPI_ADDRESS_RANGE_ACPI = 3, ACPI_ADDRESS_RANGE_NVS = 4, ACPI_ADDRESS_RANGE_COUNT }; /* Table Handlers */ typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); /* Debugger support */ struct acpi_debugger_ops { int (*create_thread)(acpi_osd_exec_callback function, void *context); ssize_t (*write_log)(const char *msg); ssize_t (*read_cmd)(char *buffer, size_t length); int (*wait_command_ready)(bool single_step, char *buffer, size_t length); int (*notify_command_complete)(void); }; struct acpi_debugger { const struct acpi_debugger_ops *ops; struct module *owner; struct mutex lock; }; #ifdef CONFIG_ACPI_DEBUGGER int __init acpi_debugger_init(void); int acpi_register_debugger(struct module *owner, const struct acpi_debugger_ops *ops); void acpi_unregister_debugger(const struct acpi_debugger_ops *ops); int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context); ssize_t acpi_debugger_write_log(const char *msg); ssize_t acpi_debugger_read_cmd(char *buffer, size_t buffer_length); int acpi_debugger_wait_command_ready(void); int acpi_debugger_notify_command_complete(void); #else static inline int acpi_debugger_init(void) { return -ENODEV; } static inline int acpi_register_debugger(struct module *owner, const struct acpi_debugger_ops *ops) { return -ENODEV; } static inline void acpi_unregister_debugger(const struct acpi_debugger_ops *ops) { } static inline int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context) { return -ENODEV; } static inline int acpi_debugger_write_log(const char *msg) { return -ENODEV; } static inline int acpi_debugger_read_cmd(char *buffer, u32 buffer_length) { return -ENODEV; } static inline int acpi_debugger_wait_command_ready(void) { return -ENODEV; } static inline int acpi_debugger_notify_command_complete(void) { return -ENODEV; } #endif #define BAD_MADT_ENTRY(entry, end) ( \ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) void __iomem *__acpi_map_table(unsigned long phys, unsigned long size); void __acpi_unmap_table(void __iomem *map, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); void acpi_boot_table_prepare (void); void acpi_boot_table_init (void); int acpi_mps_check (void); int acpi_numa_init (void); int acpi_locate_initial_tables (void); void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, acpi_tbl_entry_handler handler, unsigned int max_entries); int __init_or_acpilib acpi_table_parse_entries_array(char *id, unsigned long table_size, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries); int acpi_table_parse_madt(enum acpi_madt_type id, acpi_tbl_entry_handler handler, unsigned int max_entries); int __init_or_acpilib acpi_table_parse_cedt(enum acpi_cedt_type id, acpi_tbl_entry_handler_arg handler_arg, void *arg); int acpi_parse_mcfg (struct acpi_table_header *header); void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); #else static inline void acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } #endif void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); #if defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) void acpi_arch_dma_setup(struct device *dev); #else static inline void acpi_arch_dma_setup(struct device *dev) { } #endif #ifdef CONFIG_ARM64 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); #else static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } #endif #ifdef CONFIG_RISCV void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa); #else static inline void acpi_numa_rintc_affinity_init(struct acpi_srat_rintc_affinity *pa) { } #endif #ifndef PHYS_CPUID_INVALID typedef u32 phys_cpuid_t; #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1) #endif static inline bool invalid_logical_cpuid(u32 cpuid) { return (int)cpuid < 0; } static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) { return phys_id == PHYS_CPUID_INVALID; } int __init acpi_get_madt_revision(void); /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); /* Processor _CTS control */ struct acpi_processor_power; #ifdef CONFIG_ACPI_PROCESSOR_CSTATE bool acpi_processor_claim_cst_control(void); int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_processor_power *info); #else static inline bool acpi_processor_claim_cst_control(void) { return false; } static inline int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_processor_power *info) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu); int acpi_unmap_cpu(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ acpi_handle acpi_get_processor_handle(int cpu); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base); int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base); int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base); void acpi_irq_stats_init(void); extern u32 acpi_irq_handled; extern u32 acpi_irq_not_handled; extern unsigned int acpi_sci_irq; extern bool acpi_no_s5; #define INVALID_ACPI_IRQ ((unsigned)-1) static inline bool acpi_sci_irq_valid(void) { return acpi_sci_irq != INVALID_ACPI_IRQ; } extern int sbf_port; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi); void acpi_set_irq_model(enum acpi_irq_model_id model, struct fwnode_handle *(*)(u32)); void acpi_set_gsi_to_irq_fallback(u32 (*)(u32)); struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags, unsigned int size, struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data); #ifdef CONFIG_X86_IO_APIC extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); #else static inline int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) { return -1; } #endif /* * This function undoes the effect of one call to acpi_register_gsi(). * If this matches the last registration, any IRQ resources for gsi * are freed. */ void acpi_unregister_gsi (u32 gsi); struct pci_dev; struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin); int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); #ifdef CONFIG_PCI void acpi_penalize_sci_irq(int irq, int trigger, int polarity); #else static inline void acpi_penalize_sci_irq(int irq, int trigger, int polarity) { } #endif void acpi_pci_irq_disable (struct pci_dev *dev); extern int ec_read(u8 addr, u8 *val); extern int ec_write(u8 addr, u8 val); extern int ec_transaction(u8 command, const u8 *wdata, unsigned wdata_len, u8 *rdata, unsigned rdata_len); extern acpi_handle ec_get_handle(void); extern bool acpi_is_pnp_device(struct acpi_device *); #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE) typedef void (*wmi_notify_handler) (union acpi_object *data, void *context); int wmi_instance_count(const char *guid); extern acpi_status wmi_evaluate_method(const char *guid, u8 instance, u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out); extern acpi_status wmi_query_block(const char *guid, u8 instance, struct acpi_buffer *out); extern acpi_status wmi_set_block(const char *guid, u8 instance, const struct acpi_buffer *in); extern acpi_status wmi_install_notify_handler(const char *guid, wmi_notify_handler handler, void *data); extern acpi_status wmi_remove_notify_handler(const char *guid); extern bool wmi_has_guid(const char *guid); extern char *wmi_get_acpi_device_uid(const char *guid); #endif /* CONFIG_ACPI_WMI */ #define ACPI_VIDEO_OUTPUT_SWITCHING 0x0001 #define ACPI_VIDEO_DEVICE_POSTING 0x0002 #define ACPI_VIDEO_ROM_AVAILABLE 0x0004 #define ACPI_VIDEO_BACKLIGHT 0x0008 #define ACPI_VIDEO_BACKLIGHT_FORCE_VENDOR 0x0010 #define ACPI_VIDEO_BACKLIGHT_FORCE_VIDEO 0x0020 #define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VENDOR 0x0040 #define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VIDEO 0x0080 #define ACPI_VIDEO_BACKLIGHT_DMI_VENDOR 0x0100 #define ACPI_VIDEO_BACKLIGHT_DMI_VIDEO 0x0200 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR 0x0400 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO 0x0800 extern char acpi_video_backlight_string[]; extern long acpi_is_video_device(acpi_handle handle); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_THERMAL_LIB int thermal_acpi_active_trip_temp(struct acpi_device *adev, int id, int *ret_temp); int thermal_acpi_passive_trip_temp(struct acpi_device *adev, int *ret_temp); int thermal_acpi_hot_trip_temp(struct acpi_device *adev, int *ret_temp); int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); #endif #ifdef CONFIG_ACPI_HMAT int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord); #else static inline int acpi_get_genport_coordinates(u32 uid, struct access_coordinate *coord) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); /** * pxm_to_online_node - Map proximity ID to online node * @pxm: ACPI proximity ID * * This is similar to pxm_to_node(), but always returns an online * node. When the mapped node from a given proximity ID is offline, it * looks up the node distance table and returns the nearest online node. * * ACPI device drivers, which are called after the NUMA initialization has * completed in the kernel, can call this interface to obtain their device * NUMA topology from ACPI tables. Such drivers do not have to deal with * offline nodes. A node may be offline when SRAT memory entry does not exist, * or NUMA is disabled, ex. "numa=off" on x86. */ static inline int pxm_to_online_node(int pxm) { int node = pxm_to_node(pxm); return numa_map_to_online_node(node); } #else static inline int pxm_to_online_node(int pxm) { return 0; } static inline int acpi_map_pxm_to_node(int pxm) { return 0; } static inline int acpi_get_node(acpi_handle handle) { return 0; } #endif extern int pnpacpi_disabled; #define PXM_INVAL (-1) bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, struct resource_win *win); bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, struct resource_win *win); unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable, u8 wake_capable); unsigned int acpi_dev_get_irq_type(int triggering, int polarity); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), void *preproc_data); int acpi_dev_get_dma_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_filter_resource_type(struct acpi_resource *ares, unsigned long types); static inline int acpi_dev_filter_resource_type_cb(struct acpi_resource *ares, void *arg) { return acpi_dev_filter_resource_type(ares, (unsigned long)arg); } struct acpi_device *acpi_resource_consumer(struct resource *res); int acpi_check_resource_conflict(const struct resource *res); int acpi_check_region(resource_size_t start, resource_size_t n, const char *name); int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION extern int acpi_check_s4_hw_signature; #endif #ifdef CONFIG_PM_SLEEP void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); void __init acpi_nvs_nosave_s3(void); void __init acpi_sleep_no_blacklist(void); #endif /* CONFIG_PM_SLEEP */ int acpi_register_wakeup_handler( int wake_irq, bool (*wakeup)(void *context), void *context); void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context); struct acpi_osc_context { char *uuid_str; /* UUID string */ int rev; struct acpi_buffer cap; /* list of DWORD capabilities */ struct acpi_buffer ret; /* free by caller if success */ }; acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); /* Number of _OSC capability DWORDS depends on bridge type */ #define OSC_PCI_CAPABILITY_DWORDS 3 #define OSC_CXL_CAPABILITY_DWORDS 5 /* Indexes into _OSC Capabilities Buffer (DWORDs 2 to 5 are device-specific) */ #define OSC_QUERY_DWORD 0 /* DWORD 1 */ #define OSC_SUPPORT_DWORD 1 /* DWORD 2 */ #define OSC_CONTROL_DWORD 2 /* DWORD 3 */ #define OSC_EXT_SUPPORT_DWORD 3 /* DWORD 4 */ #define OSC_EXT_CONTROL_DWORD 4 /* DWORD 5 */ /* _OSC Capabilities DWORD 1: Query/Control and Error Returns (generic) */ #define OSC_QUERY_ENABLE 0x00000001 /* input */ #define OSC_REQUEST_ERROR 0x00000002 /* return */ #define OSC_INVALID_UUID_ERROR 0x00000004 /* return */ #define OSC_INVALID_REVISION_ERROR 0x00000008 /* return */ #define OSC_CAPABILITIES_MASK_ERROR 0x00000010 /* return */ /* Platform-Wide Capabilities _OSC: Capabilities DWORD 2: Support Field */ #define OSC_SB_PAD_SUPPORT 0x00000001 #define OSC_SB_PPC_OST_SUPPORT 0x00000002 #define OSC_SB_PR3_SUPPORT 0x00000004 #define OSC_SB_HOTPLUG_OST_SUPPORT 0x00000008 #define OSC_SB_APEI_SUPPORT 0x00000010 #define OSC_SB_CPC_SUPPORT 0x00000020 #define OSC_SB_CPCV2_SUPPORT 0x00000040 #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT 0x00000200 #define OSC_SB_OVER_16_PSTATES_SUPPORT 0x00000400 #define OSC_SB_GED_SUPPORT 0x00000800 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 #define OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT 0x00002000 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000 #define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00020000 #define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000 #define OSC_SB_BATTERY_CHARGE_LIMITING_SUPPORT 0x00080000 #define OSC_SB_PRM_SUPPORT 0x00200000 #define OSC_SB_FFH_OPR_SUPPORT 0x00400000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; extern bool osc_sb_native_usb4_support_confirmed; extern bool osc_sb_cppc2_support_acked; extern bool osc_cpc_flexible_adr_space_confirmed; /* USB4 Capabilities */ #define OSC_USB_USB3_TUNNELING 0x00000001 #define OSC_USB_DP_TUNNELING 0x00000002 #define OSC_USB_PCIE_TUNNELING 0x00000004 #define OSC_USB_XDOMAIN 0x00000008 extern u32 osc_sb_native_usb4_control; /* PCI Host Bridge _OSC: Capabilities DWORD 2: Support Field */ #define OSC_PCI_EXT_CONFIG_SUPPORT 0x00000001 #define OSC_PCI_ASPM_SUPPORT 0x00000002 #define OSC_PCI_CLOCK_PM_SUPPORT 0x00000004 #define OSC_PCI_SEGMENT_GROUPS_SUPPORT 0x00000008 #define OSC_PCI_MSI_SUPPORT 0x00000010 #define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 #define OSC_PCI_SHPC_NATIVE_HP_CONTROL 0x00000002 #define OSC_PCI_EXPRESS_PME_CONTROL 0x00000004 #define OSC_PCI_EXPRESS_AER_CONTROL 0x00000008 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 /* CXL _OSC: Capabilities DWORD 4: Support Field */ #define OSC_CXL_1_1_PORT_REG_ACCESS_SUPPORT 0x00000001 #define OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT 0x00000002 #define OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT 0x00000004 #define OSC_CXL_NATIVE_HP_SUPPORT 0x00000008 /* CXL _OSC: Capabilities DWORD 5: Control Field */ #define OSC_CXL_ERROR_REPORTING_CONTROL 0x00000001 static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; return ret[OSC_CONTROL_DWORD]; } static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; return ret[OSC_EXT_CONTROL_DWORD]; } #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 #define ACPI_GSB_ACCESS_ATTRIB_BYTE 0x00000006 #define ACPI_GSB_ACCESS_ATTRIB_WORD 0x00000008 #define ACPI_GSB_ACCESS_ATTRIB_BLOCK 0x0000000A #define ACPI_GSB_ACCESS_ATTRIB_MULTIBYTE 0x0000000B #define ACPI_GSB_ACCESS_ATTRIB_WORD_CALL 0x0000000C #define ACPI_GSB_ACCESS_ATTRIB_BLOCK_CALL 0x0000000D #define ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES 0x0000000E #define ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS 0x0000000F /* Enable _OST when all relevant hotplug operations are enabled */ #if defined(CONFIG_ACPI_HOTPLUG_CPU) && \ defined(CONFIG_ACPI_HOTPLUG_MEMORY) && \ defined(CONFIG_ACPI_CONTAINER) #define ACPI_HOTPLUG_OST #endif /* _OST Source Event Code (OSPM Action) */ #define ACPI_OST_EC_OSPM_SHUTDOWN 0x100 #define ACPI_OST_EC_OSPM_EJECT 0x103 #define ACPI_OST_EC_OSPM_INSERTION 0x200 /* _OST General Processing Status Code */ #define ACPI_OST_SC_SUCCESS 0x0 #define ACPI_OST_SC_NON_SPECIFIC_FAILURE 0x1 #define ACPI_OST_SC_UNRECOGNIZED_NOTIFY 0x2 /* _OST OS Shutdown Processing (0x100) Status Code */ #define ACPI_OST_SC_OS_SHUTDOWN_DENIED 0x80 #define ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS 0x81 #define ACPI_OST_SC_OS_SHUTDOWN_COMPLETED 0x82 #define ACPI_OST_SC_OS_SHUTDOWN_NOT_SUPPORTED 0x83 /* _OST Ejection Request (0x3, 0x103) Status Code */ #define ACPI_OST_SC_EJECT_NOT_SUPPORTED 0x80 #define ACPI_OST_SC_DEVICE_IN_USE 0x81 #define ACPI_OST_SC_DEVICE_BUSY 0x82 #define ACPI_OST_SC_EJECT_DEPENDENCY_BUSY 0x83 #define ACPI_OST_SC_EJECT_IN_PROGRESS 0x84 /* _OST Insertion Request (0x200) Status Code */ #define ACPI_OST_SC_INSERT_IN_PROGRESS 0x80 #define ACPI_OST_SC_DRIVER_LOAD_FAILURE 0x81 #define ACPI_OST_SC_INSERT_NOT_SUPPORTED 0x82 enum acpi_predicate { all_versions, less_than_or_equal, equal, greater_than_or_equal, }; /* Table must be terminted by a NULL entry */ struct acpi_platform_list { char oem_id[ACPI_OEM_ID_SIZE+1]; char oem_table_id[ACPI_OEM_TABLE_ID_SIZE+1]; u32 oem_revision; char *table; enum acpi_predicate pred; char *reason; u32 data; }; int acpi_match_platform_list(const struct acpi_platform_list *plat); extern void acpi_early_init(void); extern void acpi_subsystem_init(void); extern int acpi_nvs_register(__u64 start, __u64 size); extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), void *data); const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids, const struct acpi_device *adev); const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev); const void *acpi_device_get_match_data(const struct device *dev); extern bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv); int acpi_device_uevent_modalias(const struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); struct platform_device *acpi_create_platform_device(struct acpi_device *, const struct property_entry *); #define ACPI_PTR(_ptr) (_ptr) static inline void acpi_device_set_enumerated(struct acpi_device *adev) { adev->flags.visited = true; } static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { adev->flags.visited = false; } enum acpi_reconfig_event { ACPI_RECONFIG_DEVICE_ADD = 0, ACPI_RECONFIG_DEVICE_REMOVE, }; int acpi_reconfig_notifier_register(struct notifier_block *nb); int acpi_reconfig_notifier_unregister(struct notifier_block *nb); #ifdef CONFIG_ACPI_GTDT int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); int acpi_gtdt_map_ppi(int type); bool acpi_gtdt_c3stop(int type); int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); #endif #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER static inline void acpi_arch_set_root_pointer(u64 addr) { } #endif #ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER static inline u64 acpi_arch_get_root_pointer(void) { return 0; } #endif int acpi_get_local_u64_address(acpi_handle handle, u64 *addr); int acpi_get_local_address(acpi_handle handle, u32 *addr); const char *acpi_get_subsystem_id(acpi_handle handle); #else /* !CONFIG_ACPI */ #define acpi_disabled 1 #define ACPI_COMPANION(dev) (NULL) #define ACPI_COMPANION_SET(dev, adev) do { } while (0) #define ACPI_HANDLE(dev) (NULL) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) /* Get rid of the -Wunused-variable for adev */ #define acpi_dev_uid_match(adev, uid2) (adev && false) #define acpi_dev_hid_uid_match(adev, hid2, uid2) (adev && false) struct fwnode_handle; static inline bool acpi_dev_found(const char *hid) { return false; } static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) { return false; } struct acpi_device; static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) { return -ENODEV; } static inline struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) { return NULL; } static inline bool acpi_reduced_hardware(void) { return false; } static inline void acpi_dev_put(struct acpi_device *adev) {} static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return false; } static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return false; } static inline struct acpi_device *to_acpi_device_node(const struct fwnode_handle *fwnode) { return NULL; } static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return false; } static inline struct acpi_data_node *to_acpi_data_node(const struct fwnode_handle *fwnode) { return NULL; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return NULL; } static inline acpi_handle acpi_device_handle(struct acpi_device *adev) { return NULL; } static inline bool has_acpi_companion(struct device *dev) { return false; } static inline void acpi_preset_companion(struct device *dev, struct acpi_device *parent, u64 addr) { } static inline const char *acpi_dev_name(struct acpi_device *adev) { return NULL; } static inline struct device *acpi_get_first_physical_node(struct acpi_device *adev) { return NULL; } static inline void acpi_early_init(void) { } static inline void acpi_subsystem_init(void) { } static inline int early_acpi_boot_init(void) { return 0; } static inline int acpi_boot_init(void) { return 0; } static inline void acpi_boot_table_prepare(void) { } static inline void acpi_boot_table_init(void) { } static inline int acpi_mps_check(void) { return 0; } static inline int acpi_check_resource_conflict(struct resource *res) { return 0; } static inline int acpi_check_region(resource_size_t start, resource_size_t n, const char *name) { return 0; } struct acpi_table_header; static inline int acpi_table_parse(char *id, int (*handler)(struct acpi_table_header *)) { return -ENODEV; } static inline int acpi_nvs_register(__u64 start, __u64 size) { return 0; } static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), void *data) { return 0; } struct acpi_device_id; static inline const struct acpi_device_id *acpi_match_acpi_device( const struct acpi_device_id *ids, const struct acpi_device *adev) { return NULL; } static inline const struct acpi_device_id *acpi_match_device( const struct acpi_device_id *ids, const struct device *dev) { return NULL; } static inline const void *acpi_device_get_match_data(const struct device *dev) { return NULL; } static inline bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv) { return false; } static inline bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs) { return false; } static inline union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4) { return NULL; } static inline union acpi_object *acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { return NULL; } static inline int acpi_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { return -ENODEV; } static inline int acpi_device_modalias(struct device *dev, char *buf, int size) { return -ENODEV; } static inline struct platform_device * acpi_create_platform_device(struct acpi_device *adev, const struct property_entry *properties) { return NULL; } static inline bool acpi_dma_supported(const struct acpi_device *adev) { return false; } static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) { return DEV_DMA_NOT_SUPPORTED; } static inline int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) { return -ENODEV; } static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return 0; } static inline int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id) { return 0; } #define ACPI_PTR(_ptr) (NULL) static inline void acpi_device_set_enumerated(struct acpi_device *adev) { } static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { } static inline int acpi_reconfig_notifier_register(struct notifier_block *nb) { return -EINVAL; } static inline int acpi_reconfig_notifier_unregister(struct notifier_block *nb) { return -EINVAL; } static inline struct acpi_device *acpi_resource_consumer(struct resource *res) { return NULL; } static inline int acpi_get_local_address(acpi_handle handle, u32 *addr) { return -ENODEV; } static inline const char *acpi_get_subsystem_id(acpi_handle handle) { return ERR_PTR(-ENODEV); } static inline int acpi_register_wakeup_handler(int wake_irq, bool (*wakeup)(void *context), void *context) { return -ENXIO; } static inline void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context) { } struct acpi_osc_context; static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { return 0; } static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) { return 0; } static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } static inline acpi_handle acpi_get_processor_handle(int cpu) { return NULL; } #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HMAT int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, resource_size_t *size); #else static inline int hmat_get_extended_linear_cache_size(struct resource *backing_res, int nid, resource_size_t *size) { return -EOPNOTSUPP; } #endif extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_ioapic_add(acpi_handle root); #else static inline int acpi_ioapic_add(acpi_handle root) { return 0; } #endif #ifdef CONFIG_ACPI void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, u32 pm1a_ctrl, u32 pm1b_ctrl)); acpi_status acpi_os_prepare_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state, u32 val_a, u32 val_b)); acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b); #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) struct acpi_s2idle_dev_ops { struct list_head list_node; void (*prepare)(void); void (*check)(void); void (*restore)(void); }; int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); int acpi_get_lps0_constraint(struct acpi_device *adev); #else /* CONFIG_SUSPEND && CONFIG_X86 */ static inline int acpi_get_lps0_constraint(struct device *dev) { return ACPI_STATE_UNKNOWN; } #endif /* CONFIG_SUSPEND && CONFIG_X86 */ void arch_reserve_mem_area(acpi_physical_address addr, size_t size); #else #define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0) #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM) int acpi_dev_suspend(struct device *dev, bool wakeup); int acpi_dev_resume(struct device *dev); int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); bool acpi_storage_d3(struct device *dev); bool acpi_dev_state_d0(struct device *dev); #else static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; } static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) { return 0; } static inline bool acpi_storage_d3(struct device *dev) { return false; } static inline bool acpi_dev_state_d0(struct device *dev) { return true; } #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP) int acpi_subsys_prepare(struct device *dev); void acpi_subsys_complete(struct device *dev); int acpi_subsys_suspend_late(struct device *dev); int acpi_subsys_suspend_noirq(struct device *dev); int acpi_subsys_suspend(struct device *dev); int acpi_subsys_freeze(struct device *dev); int acpi_subsys_poweroff(struct device *dev); int acpi_subsys_restore_early(struct device *dev); #else static inline int acpi_subsys_prepare(struct device *dev) { return 0; } static inline void acpi_subsys_complete(struct device *dev) {} static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; } static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; } static inline int acpi_subsys_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_freeze(struct device *dev) { return 0; } static inline int acpi_subsys_poweroff(struct device *dev) { return 0; } static inline int acpi_subsys_restore_early(struct device *dev) { return 0; } #endif #if defined(CONFIG_ACPI_EC) && defined(CONFIG_PM_SLEEP) void acpi_ec_mark_gpe_for_wake(void); void acpi_ec_set_gpe_wake_mask(u8 action); #else static inline void acpi_ec_mark_gpe_for_wake(void) {} static inline void acpi_ec_set_gpe_wake_mask(u8 action) {} #endif #ifdef CONFIG_ACPI char *acpi_handle_path(acpi_handle handle); __printf(3, 4) void acpi_handle_printk(const char *level, acpi_handle handle, const char *fmt, ...); void acpi_evaluation_failure_warn(acpi_handle handle, const char *name, acpi_status status); #else /* !CONFIG_ACPI */ static inline __printf(3, 4) void acpi_handle_printk(const char *level, void *handle, const char *fmt, ...) {} static inline void acpi_evaluation_failure_warn(acpi_handle handle, const char *name, acpi_status status) {} #endif /* !CONFIG_ACPI */ #if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG) __printf(3, 4) void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const char *fmt, ...); #endif /* * acpi_handle_<level>: Print message with ACPI prefix and object path * * These interfaces acquire the global namespace mutex to obtain an object * path. In interrupt context, it shows the object path as <n/a>. */ #define acpi_handle_emerg(handle, fmt, ...) \ acpi_handle_printk(KERN_EMERG, handle, fmt, ##__VA_ARGS__) #define acpi_handle_alert(handle, fmt, ...) \ acpi_handle_printk(KERN_ALERT, handle, fmt, ##__VA_ARGS__) #define acpi_handle_crit(handle, fmt, ...) \ acpi_handle_printk(KERN_CRIT, handle, fmt, ##__VA_ARGS__) #define acpi_handle_err(handle, fmt, ...) \ acpi_handle_printk(KERN_ERR, handle, fmt, ##__VA_ARGS__) #define acpi_handle_warn(handle, fmt, ...) \ acpi_handle_printk(KERN_WARNING, handle, fmt, ##__VA_ARGS__) #define acpi_handle_notice(handle, fmt, ...) \ acpi_handle_printk(KERN_NOTICE, handle, fmt, ##__VA_ARGS__) #define acpi_handle_info(handle, fmt, ...) \ acpi_handle_printk(KERN_INFO, handle, fmt, ##__VA_ARGS__) #if defined(DEBUG) #define acpi_handle_debug(handle, fmt, ...) \ acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__) #else #if defined(CONFIG_DYNAMIC_DEBUG) #define acpi_handle_debug(handle, fmt, ...) \ _dynamic_func_call(fmt, __acpi_handle_debug, \ handle, pr_fmt(fmt), ##__VA_ARGS__) #else #define acpi_handle_debug(handle, fmt, ...) \ ({ \ if (0) \ acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__); \ 0; \ }) #endif #endif #if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB) bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index, bool *wake_capable); #else static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { return false; } static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { return false; } static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index, bool *wake_capable) { return -ENXIO; } #endif static inline int acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index, bool *wake_capable) { return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, wake_capable); } static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *con_id, int index) { return acpi_dev_gpio_irq_wake_get_by(adev, con_id, index, NULL); } static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index) { return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, NULL); } /* Device properties */ #ifdef CONFIG_ACPI int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj); int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct fwnode_reference_args *args); static inline int acpi_node_get_property_reference( const struct fwnode_handle *fwnode, const char *name, size_t index, struct fwnode_reference_args *args) { return __acpi_node_get_property_reference(fwnode, name, index, NR_FWNODE_REFERENCE_ARGS, args); } static inline bool acpi_dev_has_props(const struct acpi_device *adev) { return !list_empty(&adev->data.properties); } struct acpi_device_properties * acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, union acpi_object *properties); int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); #define ACPI_TABLE_ID_LEN 5 /** * struct acpi_probe_entry - boot-time probing entry * @id: ACPI table name * @type: Optional subtable type to match * (if @id contains subtables) * @subtable_valid: Optional callback to check the validity of * the subtable * @probe_table: Callback to the driver being probed when table * match is successful * @probe_subtbl: Callback to the driver being probed when table and * subtable match (and optional callback is successful) * @driver_data: Sideband data provided back to the driver */ struct acpi_probe_entry { __u8 id[ACPI_TABLE_ID_LEN]; __u8 type; acpi_probe_entry_validate_subtbl subtable_valid; union { acpi_tbl_table_handler probe_table; acpi_tbl_entry_handler probe_subtbl; }; kernel_ulong_t driver_data; }; void arch_sort_irqchip_probe(struct acpi_probe_entry *ap_head, int nr); #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, \ valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ .probe_table = fn, \ .driver_data = data, \ } #define ACPI_DECLARE_SUBTABLE_PROBE_ENTRY(table, name, table_id, \ subtable, valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ .probe_subtbl = fn, \ .driver_data = data, \ } #define ACPI_PROBE_TABLE(name) __##name##_acpi_probe_table #define ACPI_PROBE_TABLE_END(name) __##name##_acpi_probe_table_end int __acpi_probe_device_table(struct acpi_probe_entry *start, int nr); #define acpi_probe_device_table(t) \ ({ \ extern struct acpi_probe_entry ACPI_PROBE_TABLE(t), \ ACPI_PROBE_TABLE_END(t); \ __acpi_probe_device_table(&ACPI_PROBE_TABLE(t), \ (&ACPI_PROBE_TABLE_END(t) - \ &ACPI_PROBE_TABLE(t))); \ }) #else static inline int acpi_dev_get_property(struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return -ENXIO; } static inline int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct fwnode_reference_args *args) { return -ENXIO; } static inline int acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, struct fwnode_reference_args *args) { return -ENXIO; } static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return -ENXIO; } static inline struct fwnode_handle * acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { return NULL; } static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { return ERR_PTR(-ENXIO); } static inline int acpi_graph_get_remote_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle **remote, struct fwnode_handle **port, struct fwnode_handle **endpoint) { return -ENXIO; } #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ = { (void *) table_id, \ (void *) subtable, \ (void *) valid, \ (void *) fn, \ (void *) data } #define acpi_probe_device_table(t) ({ int __r = 0; __r;}) #endif #ifdef CONFIG_ACPI_TABLE_UPGRADE void acpi_table_upgrade(void); #else static inline void acpi_table_upgrade(void) { } #endif #if defined(CONFIG_ACPI) && defined(CONFIG_ACPI_WATCHDOG) extern bool acpi_has_watchdog(void); #else static inline bool acpi_has_watchdog(void) { return false; } #endif #ifdef CONFIG_ACPI_SPCR_TABLE extern bool qdf2400_e44_present; int acpi_parse_spcr(bool enable_earlycon, bool enable_console); #else static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console) { return 0; } #endif #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res); #else static inline int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) { return -EINVAL; } #endif #ifdef CONFIG_ACPI_LPIT int lpit_read_residency_count_address(u64 *address); #else static inline int lpit_read_residency_count_address(u64 *address) { return -EINVAL; } #endif #ifdef CONFIG_ACPI_PROCESSOR_IDLE #ifndef arch_get_idle_state_flags static inline unsigned int arch_get_idle_state_flags(u32 arch_flags) { return 0; } #endif #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ #ifdef CONFIG_ACPI_PPTT int acpi_pptt_cpu_is_thread(unsigned int cpu); int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; } static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } #endif void acpi_arch_init(void); #ifdef CONFIG_ACPI_PCC void acpi_init_pcc(void); #else static inline void acpi_init_pcc(void) { } #endif #ifdef CONFIG_ACPI_FFH void acpi_init_ffh(void); extern int acpi_ffh_address_space_arch_setup(void *handler_ctxt, void **region_ctxt); extern int acpi_ffh_address_space_arch_handler(acpi_integer *value, void *region_context); #else static inline void acpi_init_ffh(void) { } #endif #ifdef CONFIG_ACPI extern void acpi_device_notify(struct device *dev); extern void acpi_device_notify_remove(struct device *dev); #else static inline void acpi_device_notify(struct device *dev) { } static inline void acpi_device_notify_remove(struct device *dev) { } #endif static inline void acpi_use_parent_companion(struct device *dev) { ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent)); } #ifdef CONFIG_ACPI_HMAT int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, enum access_coordinate_class access); #else static inline int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, enum access_coordinate_class access) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_ACPI_NUMA bool acpi_node_backed_by_real_pxm(int nid); #else static inline bool acpi_node_backed_by_real_pxm(int nid) { return false; } #endif #endif /*_LINUX_ACPI_H*/
21 10 3 2 1 2 3 12 12 1 4 5 8 8 6 5 1 2 1 5 5 5 5 4 17 3 16 4 4 15 5 1 19 19 2 20 15 2 29 1 27 2 27 2 23 5 4 25 18 5 9 10 3 1 21 3 1 5 5 5 5 5 1 4 5 5 56 44 33 57 2181 2184 2187 748 70 2 2 2 2 2 5 2 56 62 62 12 1 1 1 1 1 3 1 1 3 3 1 34 4 30 30 1 1 59 59 3 46 9 1 4 51 34 29 30 30 30 30 30 10 9 6 6 5 1 5 5 5 5 5 1 4 5 25 2 2 2 59 59 1 58 59 59 57 2 58 1 58 61 60 1 1 59 3 59 4 4 4 4 4 5 5 5 3 5 5 5 5 5 5 5 5 5 5 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 // SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_taprio.c Time Aware Priority Scheduler * * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> * */ #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/time.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> #include <net/sock.h> #include <net/tcp.h> #define TAPRIO_STAT_NOT_SET (~0ULL) #include "sch_mqprio_lib.h" static LIST_HEAD(taprio_list); static struct static_key_false taprio_have_broken_mqprio; static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_ALL_GATES_OPEN -1 #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_SUPPORTED_FLAGS \ (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { /* Durations between this GCL entry and the GCL entry where the * respective traffic class gate closes */ u64 gate_duration[TC_MAX_QUEUE]; atomic_t budget[TC_MAX_QUEUE]; /* The qdisc makes some effort so that no packet leaves * after this time */ ktime_t gate_close_time[TC_MAX_QUEUE]; struct list_head list; /* Used to calculate when to advance the schedule */ ktime_t end_time; ktime_t next_txtime; int index; u32 gate_mask; u32 interval; u8 command; }; struct sched_gate_list { /* Longest non-zero contiguous gate durations per traffic class, * or 0 if a traffic class gate never opens during the schedule. */ u64 max_open_gate_duration[TC_MAX_QUEUE]; u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; }; struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; u32 flags; enum tk_offsets tk_offset; int clockid; bool offloaded; bool detected_mqprio; bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; int cur_txq[TC_MAX_QUEUE]; u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */ u32 txtime_delay; }; struct __tc_taprio_qopt_offload { refcount_t users; struct tc_taprio_qopt_offload offload; }; static void taprio_calculate_gate_durations(struct taprio_sched *q, struct sched_gate_list *sched) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *cur; int tc; list_for_each_entry(entry, &sched->entries, list) { u32 gates_still_open = entry->gate_mask; /* For each traffic class, calculate each open gate duration, * starting at this schedule entry and ending at the schedule * entry containing a gate close event for that TC. */ cur = entry; do { if (!gates_still_open) break; for (tc = 0; tc < num_tc; tc++) { if (!(gates_still_open & BIT(tc))) continue; if (cur->gate_mask & BIT(tc)) entry->gate_duration[tc] += cur->interval; else gates_still_open &= ~BIT(tc); } cur = list_next_entry_circular(cur, &sched->entries, list); } while (cur != entry); /* Keep track of the maximum gate duration for each traffic * class, taking care to not confuse a traffic class which is * temporarily closed with one that is always closed. */ for (tc = 0; tc < num_tc; tc++) if (entry->gate_duration[tc] && sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; } } static bool taprio_entry_allows_tx(ktime_t skb_end_time, struct sched_entry *entry, int tc) { return ktime_before(skb_end_time, entry->gate_close_time[tc]); } static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) return KTIME_MAX; return ns_to_ktime(sched->base_time); } static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); switch (tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, tk_offset); } } static ktime_t taprio_get_time(const struct taprio_sched *q) { return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); } kfree(sched); } static void switch_schedules(struct taprio_sched *q, struct sched_gate_list **admin, struct sched_gate_list **oper) { rcu_assign_pointer(q->oper_sched, *admin); rcu_assign_pointer(q->admin_sched, NULL); if (*oper) call_rcu(&(*oper)->rcu, taprio_free_sched_cb); *oper = *admin; *admin = NULL; } /* Get how much time has been already elapsed in the current cycle. */ static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) { ktime_t time_since_sched_start; s32 time_elapsed; time_since_sched_start = ktime_sub(time, sched->base_time); div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); return time_elapsed; } static ktime_t get_interval_end_time(struct sched_gate_list *sched, struct sched_gate_list *admin, struct sched_entry *entry, ktime_t intv_start) { s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); ktime_t intv_end, cycle_ext_end, cycle_end; cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); intv_end = ktime_add_ns(intv_start, entry->interval); cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); if (ktime_before(intv_end, cycle_end)) return intv_end; else if (admin && admin != sched && ktime_after(admin->base_time, cycle_end) && ktime_before(admin->base_time, cycle_ext_end)) return admin->base_time; else return cycle_end; } static int length_to_duration(struct taprio_sched *q, int len) { return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } static int duration_to_length(struct taprio_sched *q, u64 duration) { return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); } /* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the * q->max_sdu[] requested by the user and the max_sdu dynamically determined by * the maximum open gate durations at the given link speed. */ static void taprio_update_queue_max_sdu(struct taprio_sched *q, struct sched_gate_list *sched, struct qdisc_size_table *stab) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); u32 max_sdu_from_user; u32 max_sdu_dynamic; u32 max_sdu; int tc; for (tc = 0; tc < num_tc; tc++) { max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; /* TC gate never closes => keep the queueMaxSDU * selected by the user */ if (sched->max_open_gate_duration[tc] == sched->cycle_time) { max_sdu_dynamic = U32_MAX; } else { u32 max_frm_len; max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); /* Compensate for L1 overhead from size table, * but don't let the frame size go negative */ if (stab) { max_frm_len -= stab->szopts.overhead; max_frm_len = max_t(int, max_frm_len, dev->hard_header_len + 1); } max_sdu_dynamic = max_frm_len - dev->hard_header_len; if (max_sdu_dynamic > dev->max_mtu) max_sdu_dynamic = U32_MAX; } max_sdu = min(max_sdu_dynamic, max_sdu_from_user); if (max_sdu != U32_MAX) { sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; sched->max_sdu[tc] = max_sdu; } else { sched->max_frm_len[tc] = U32_MAX; /* never oversized */ sched->max_sdu[tc] = 0; } } } /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. */ static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, struct Qdisc *sch, struct sched_gate_list *sched, struct sched_gate_list *admin, ktime_t time, ktime_t *interval_start, ktime_t *interval_end, bool validate_interval) { ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; struct sched_entry *entry = NULL, *entry_found = NULL; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool entry_available = false; s32 cycle_elapsed; int tc, n; tc = netdev_get_prio_tc_map(dev, skb->priority); packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); *interval_start = 0; *interval_end = 0; if (!sched) return NULL; cycle = sched->cycle_time; cycle_elapsed = get_cycle_time_elapsed(sched, time); curr_intv_end = ktime_sub_ns(time, cycle_elapsed); cycle_end = ktime_add_ns(curr_intv_end, cycle); list_for_each_entry(entry, &sched->entries, list) { curr_intv_start = curr_intv_end; curr_intv_end = get_interval_end_time(sched, admin, entry, curr_intv_start); if (ktime_after(curr_intv_start, cycle_end)) break; if (!(entry->gate_mask & BIT(tc)) || packet_transmit_time > entry->interval) continue; txtime = entry->next_txtime; if (ktime_before(txtime, time) || validate_interval) { transmit_end_time = ktime_add_ns(time, packet_transmit_time); if ((ktime_before(curr_intv_start, time) && ktime_before(transmit_end_time, curr_intv_end)) || (ktime_after(curr_intv_start, time) && !validate_interval)) { entry_found = entry; *interval_start = curr_intv_start; *interval_end = curr_intv_end; break; } else if (!entry_available && !validate_interval) { /* Here, we are just trying to find out the * first available interval in the next cycle. */ entry_available = true; entry_found = entry; *interval_start = ktime_add_ns(curr_intv_start, cycle); *interval_end = ktime_add_ns(curr_intv_end, cycle); } } else if (ktime_before(txtime, earliest_txtime) && !entry_available) { earliest_txtime = txtime; entry_found = entry; n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); *interval_start = ktime_add(curr_intv_start, n * cycle); *interval_end = ktime_add(curr_intv_end, n * cycle); } } return entry_found; } static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t interval_start, interval_end; struct sched_entry *entry; rcu_read_lock(); sched = rcu_dereference(q->oper_sched); admin = rcu_dereference(q->admin_sched); entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, &interval_start, &interval_end, true); rcu_read_unlock(); return entry; } /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct iphdr *iph; struct ipv6hdr _ipv6h; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return 0; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return 0; } else if (iph->protocol != IPPROTO_TCP) { return 0; } } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { return 0; } return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, * a. The gate for the traffic class is currently open and packet can be * transmitted before it closes, schedule the packet right away. * b. If the gate corresponding to the traffic class is going to open later * in the cycle, set the txtime of packet to the interval start. * 2. If txtime is in the future, there are packets corresponding to the * current traffic class waiting to be transmitted. So, the following * possibilities exist: * a. We can transmit the packet before the window containing the txtime * closes. * b. The window might close before the transmission can be completed * successfully. So, schedule the packet in the next open window. */ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) { ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t minimum_time, now, txtime; int len, packet_transmit_time; struct sched_entry *entry; bool sched_changed; now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); tcp_tstamp = get_tcp_tstamp(q, skb); minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); rcu_read_lock(); admin = rcu_dereference(q->admin_sched); sched = rcu_dereference(q->oper_sched); if (admin && ktime_after(minimum_time, admin->base_time)) switch_schedules(q, &admin, &sched); /* Until the schedule starts, all the queues are open */ if (!sched || ktime_before(minimum_time, sched->base_time)) { txtime = minimum_time; goto done; } len = qdisc_pkt_len(skb); packet_transmit_time = length_to_duration(q, len); do { sched_changed = false; entry = find_entry_to_transmit(skb, sch, sched, admin, minimum_time, &interval_start, &interval_end, false); if (!entry) { txtime = 0; goto done; } txtime = entry->next_txtime; txtime = max_t(ktime_t, txtime, minimum_time); txtime = max_t(ktime_t, txtime, interval_start); if (admin && admin != sched && ktime_after(txtime, admin->base_time)) { sched = admin; sched_changed = true; continue; } transmit_end_time = ktime_add(txtime, packet_transmit_time); minimum_time = transmit_end_time; /* Update the txtime of current entry to the next time it's * interval starts. */ if (ktime_after(transmit_end_time, interval_end)) entry->next_txtime = ktime_add(interval_start, sched->cycle_time); } while (sched_changed || ktime_after(transmit_end_time, interval_end)); entry->next_txtime = transmit_end_time; done: rcu_read_unlock(); return txtime; } /* Devices with full offload are expected to honor this in hardware */ static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *sched; int prio = skb->priority; bool exceeds = false; u8 tc; tc = netdev_get_prio_tc_map(dev, prio); rcu_read_lock(); sched = rcu_dereference(q->oper_sched); if (sched && skb->len > sched->max_frm_len[tc]) exceeds = true; rcu_read_unlock(); return exceeds; } static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); /* sk_flags are only safe to use on full sockets. */ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free); } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { skb->tstamp = get_packet_txtime(skb, sch); if (!skb->tstamp) return qdisc_drop(skb, sch, to_free); } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; int ret; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; slen += segs->len; /* FIXME: we should be segmenting to a smaller size * rather than dropping these */ if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) ret = qdisc_drop(segs, sch, to_free); else ret = taprio_enqueue_one(segs, sch, child, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { numsegs++; } } if (numsegs > 1) qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); consume_skb(skb); return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); struct Qdisc *child; int queue; queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { /* Large packets might not be transmitted when the transmission * duration exceeds any configured interval. Therefore, segment * the skb into smaller chunks. Drivers with full offload are * expected to handle this in hardware. */ if (skb_is_gso(skb)) return taprio_enqueue_segmented(skb, sch, child, to_free); return qdisc_drop(skb, sch, to_free); } return taprio_enqueue_one(skb, sch, child, to_free); } static struct sk_buff *taprio_peek(struct Qdisc *sch) { WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); return NULL; } static void taprio_set_budgets(struct taprio_sched *q, struct sched_gate_list *sched, struct sched_entry *entry) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); int tc, budget; for (tc = 0; tc < num_tc; tc++) { /* Traffic classes which never close have infinite budget */ if (entry->gate_duration[tc] == sched->cycle_time) budget = INT_MAX; else budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); atomic_set(&entry->budget[tc], budget); } } /* When an skb is sent, it consumes from the budget of all traffic classes */ static int taprio_update_budgets(struct sched_entry *entry, size_t len, int tc_consumed, int num_tc) { int tc, budget, new_budget = 0; for (tc = 0; tc < num_tc; tc++) { budget = atomic_read(&entry->budget[tc]); /* Don't consume from infinite budget */ if (budget == INT_MAX) { if (tc == tc_consumed) new_budget = budget; continue; } if (tc == tc_consumed) new_budget = atomic_sub_return(len, &entry->budget[tc]); else atomic_sub(len, &entry->budget[tc]); } return new_budget; } static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct Qdisc *child = q->qdiscs[txq]; int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; ktime_t guard; int prio; int len; u8 tc; if (unlikely(!child)) return NULL; if (TXTIME_ASSIST_IS_ENABLED(q->flags)) goto skip_peek_checks; skb = child->ops->peek(child); if (!skb) return NULL; prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) return NULL; len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && !taprio_entry_allows_tx(guard, entry, tc)) return NULL; /* ... and no budget. */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && taprio_update_budgets(entry, len, tc, num_tc) < 0) return NULL; skip_peek_checks: skb = child->ops->dequeue(child); if (unlikely(!skb)) return NULL; qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) { int offset = dev->tc_to_txq[tc].offset; int count = dev->tc_to_txq[tc].count; (*txq)++; if (*txq == offset + count) *txq = offset; } /* Prioritize higher traffic classes, and select among TXQs belonging to the * same TC using round robin */ static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; int tc; for (tc = num_tc - 1; tc >= 0; tc--) { int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; do { skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], entry, gate_mask); taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); if (q->cur_txq[tc] >= dev->num_tx_queues) q->cur_txq[tc] = first_txq; if (skb) return skb; } while (q->cur_txq[tc] != first_txq); } return NULL; } /* Broken way of prioritizing smaller TXQ indices and ignoring the traffic * class other than to determine whether the gate is open or not */ static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb; int i; for (i = 0; i < dev->num_tx_queues; i++) { skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); if (skb) return skb; } return NULL; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open, this is in * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; if (!gate_mask) goto done; if (static_branch_unlikely(&taprio_have_broken_mqprio) && !static_branch_likely(&taprio_have_working_mqprio)) { /* Single NIC kind which is broken */ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); } else if (static_branch_likely(&taprio_have_working_mqprio) && !static_branch_unlikely(&taprio_have_broken_mqprio)) { /* Single NIC kind which prioritizes properly */ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } else { /* Mixed NIC kinds present in system, need dynamic testing */ if (q->broken_mqprio) skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); else skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: rcu_read_unlock(); return skb; } static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { if (list_is_last(&entry->list, &oper->entries)) return true; if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; } static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t end_time) { ktime_t next_base_time, extension_time; if (!admin) return false; next_base_time = sched_base_time(admin); /* This is the simple case, the end_time would fall after * the next schedule base_time. */ if (ktime_compare(next_base_time, end_time) <= 0) return true; /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after * conformance testing, this logic may change. */ if (ktime_compare(next_base_time, extension_time) <= 0) return true; return false; } static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t end_time; int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); oper = rcu_dereference_protected(q->oper_sched, lockdep_is_held(&q->current_entry_lock)); admin = rcu_dereference_protected(q->admin_sched, lockdep_is_held(&q->current_entry_lock)); if (!oper) switch_schedules(q, &admin, &oper); /* This can happen in two cases: 1. this is the very first run * of this function (i.e. we weren't running any schedule * previously); 2. The previous schedule just ended. The first * entry of all schedules are pre-calculated during the * schedule initialization. */ if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, oper->cycle_time); } else { next = list_next_entry(entry, list); } end_time = ktime_add_ns(entry->end_time, next->interval); end_time = min_t(ktime_t, end_time, oper->cycle_end_time); for (tc = 0; tc < num_tc; tc++) { if (next->gate_duration[tc] == oper->cycle_time) next->gate_close_time[tc] = KTIME_MAX; else next->gate_close_time[tc] = ktime_add_ns(entry->end_time, next->gate_duration[tc]); } if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } next->end_time = end_time; taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); rcu_read_unlock(); return HRTIMER_RESTART; } static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct netlink_range_validation_signed taprio_cycle_time_range = { .min = 0, .max = INT_MAX, }; static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS), [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, struct sched_entry *entry, struct netlink_ext_ack *extack) { int min_duration = length_to_duration(q, ETH_ZLEN); u32 interval = 0; if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) entry->command = nla_get_u8( tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) entry->gate_mask = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) interval = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); /* The interval should allow at least the minimum ethernet * frame to go out. */ if (interval < min_duration) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; return 0; } static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n, struct sched_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, entry_policy, NULL); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_sched_entry(q, tb, entry, extack); } static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; int err, rem; int i = 0; if (!list) return -EINVAL; nla_for_each_nested(n, list, rem) { struct sched_entry *entry; if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; } err = parse_sched_entry(q, n, entry, i, extack); if (err < 0) { kfree(entry); return err; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; } static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, struct sched_gate_list *new, struct netlink_ext_ack *extack) { int err = 0; if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); return -ENOTSUPP; } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); if (err < 0) return err; if (!new->cycle_time) { struct sched_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; } new->cycle_time = cycle; } if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); return -EINVAL; } taprio_calculate_gate_durations(q, new); return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack, u32 taprio_flags) { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt) { if (!dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } return 0; } /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } /* For some reason, in txtime-assist mode, we allow TXQ ranges for * different TCs to overlap, and just validate the TXQ ranges. */ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, extack); } static int taprio_get_start_time(struct Qdisc *sch, struct sched_gate_list *sched, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); ktime_t now, base, cycle; s64 n; base = sched_base_time(sched); now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, * something went really wrong. In that case, we should warn about this * inconsistent state and return error. */ if (WARN_ON(!cycle)) return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); return 0; } static void setup_first_end_time(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_end_time = ktime_add_ns(base, cycle); first->end_time = ktime_add_ns(base, first->interval); taprio_set_budgets(q, sched, first); for (tc = 0; tc < num_tc; tc++) { if (first->gate_duration[tc] == sched->cycle_time) first->gate_close_time[tc] = KTIME_MAX; else first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); } rcu_assign_pointer(q->current_entry, NULL); } static void taprio_start_sched(struct Qdisc *sch, ktime_t start, struct sched_gate_list *new) { struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) return; expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; /* If the new schedule starts before the next expiration, we * reprogram it to the earliest one, so we change the admin * schedule to the operational one at the right time. */ start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } static void taprio_set_picos_per_byte(struct net_device *dev, struct taprio_sched *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; int picos_per_byte; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); if (err < 0) goto skip; if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", dev->name, (long long)atomic64_read(&q->picos_per_byte), ecmd.base.speed); } static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct sched_gate_list *oper, *admin; struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; list_for_each_entry(q, &taprio_list, taprio_list) { if (dev != qdisc_dev(q->root)) continue; taprio_set_picos_per_byte(dev, q); stab = rtnl_dereference(q->root->stab); oper = rtnl_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); admin = rtnl_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); break; } return NOTIFY_DONE; } static void setup_txtime(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *entry; u64 interval = 0; list_for_each_entry(entry, &sched->entries, list) { entry->next_txtime = ktime_add_ns(base, interval); interval += entry->interval; } } static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { struct __tc_taprio_qopt_offload *__offload; __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), GFP_KERNEL); if (!__offload) return NULL; refcount_set(&__offload->users, 1); return &__offload->offload; } struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); refcount_inc(&__offload->users); return offload; } EXPORT_SYMBOL_GPL(taprio_offload_get); void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); if (!refcount_dec_and_test(&__offload->users)) return; kfree(__offload); } EXPORT_SYMBOL_GPL(taprio_offload_free); /* The function will only serve to keep the pointers to the "oper" and "admin" * schedules valid in relation to their base times, so when calling dump() the * users looks at the right schedules. * When using full offload, the admin configuration is promoted to oper at the * base_time in the PHC time domain. But because the system time is not * necessarily in sync with that, we can't just trigger a hrtimer to call * switch_schedules at the right hardware time. * At the moment we call this by hand right away from taprio, but in the future * it will be useful to create a mechanism for drivers to notify taprio of the * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO. */ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) { u32 i, queue_mask = 0; for (i = 0; i < dev->num_tc; i++) { u32 offset, count; if (!(tc_mask & BIT(i))) continue; offset = dev->tc_to_txq[i].offset; count = dev->tc_to_txq[i].count; queue_mask |= GENMASK(offset + count - 1, offset); } return queue_mask; } static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, struct tc_taprio_qopt_offload *offload, const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; offload->base_time = sched->base_time; offload->cycle_time = sched->cycle_time; offload->cycle_time_extension = sched->cycle_time_extension; list_for_each_entry(entry, &sched->entries, list) { struct tc_taprio_sched_entry *e = &offload->entries[i]; e->command = entry->command; e->interval = entry->interval; if (caps->gate_mask_per_txq) e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); else e->gate_mask = entry->gate_mask; i++; } offload->num_entries = i; } static void taprio_detect_broken_mqprio(struct taprio_sched *q) { struct net_device *dev = qdisc_dev(q->root); struct tc_taprio_caps caps; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); q->broken_mqprio = caps.broken_mqprio; if (q->broken_mqprio) static_branch_inc(&taprio_have_broken_mqprio); else static_branch_inc(&taprio_have_working_mqprio); q->detected_mqprio = true; } static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) { if (!q->detected_mqprio) return; if (q->broken_mqprio) static_branch_dec(&taprio_have_broken_mqprio); else static_branch_dec(&taprio_have_working_mqprio); } static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; struct tc_taprio_caps caps; int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support taprio offload"); return -EOPNOTSUPP; } qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); if (!caps.supports_queue_max_sdu) { for (tc = 0; tc < TC_MAX_QUEUE; tc++) { if (q->max_sdu[tc]) { NL_SET_ERR_MSG_MOD(extack, "Device does not handle queueMaxSDU"); return -EOPNOTSUPP; } } } offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory for enabling offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_REPLACE; offload->extack = extack; mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; taprio_sched_to_offload(dev, sched, offload, &caps); mqprio_fp_to_offload(q->fp, &offload->mqprio); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to setup taprio offload"); goto done; } q->offloaded = true; done: /* The offload structure may linger around via a reference taken by the * device driver, so clear up the netlink extack pointer so that the * driver isn't tempted to dereference data which stopped being valid */ offload->extack = NULL; offload->mqprio.extack = NULL; taprio_offload_free(offload); return err; } static int taprio_disable_offload(struct net_device *dev, struct taprio_sched *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; int err; if (!q->offloaded) return 0; offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory to disable offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_DESTROY; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, "Device failed to disable offload"); goto out; } q->offloaded = false; out: taprio_offload_free(offload); return err; } /* If full offload is enabled, the only possible clockid is the net device's * PHC. For that reason, specifying a clockid through netlink is incorrect. * For txtime-assist, it is implicitly assumed that the device's PHC is kept * in sync with the specified clockid via a user space daemon such as phc2sys. * For both software taprio and txtime-assist, the clockid is used for the * hrtimer that advances the schedule and hence mandatory. */ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err = -EINVAL; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { const struct ethtool_ops *ops = dev->ethtool_ops; struct kernel_ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO, .phc_index = -1, }; if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { NL_SET_ERR_MSG(extack, "The 'clockid' cannot be specified for full offload"); goto out; } if (ops && ops->get_ts_info) err = ops->get_ts_info(dev, &info); if (err || info.phc_index < 0) { NL_SET_ERR_MSG(extack, "Device does not have a PTP clock"); err = -ENOTSUPP; goto out; } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. */ if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) { NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); err = -ENOTSUPP; goto out; } switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } /* This pairs with READ_ONCE() in taprio_mono_to_any */ WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } /* Everything went ok, return success. */ err = 0; out: return err; } static int taprio_parse_tc_entry(struct Qdisc *sch, struct nlattr *opt, u32 max_sdu[TC_QOPT_MAX_QUEUE], u32 fp[TC_QOPT_MAX_QUEUE], unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; struct net_device *dev = qdisc_dev(sch); int err, tc; u32 val; err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, taprio_tc_policy, extack); if (err < 0) return err; if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); if (tc >= TC_QOPT_MAX_QUEUE) { NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); return -ERANGE; } if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) { val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); if (val > dev->max_mtu) { NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); return -ERANGE; } max_sdu[tc] = val; } if (tb[TCA_TAPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]); return 0; } static int taprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { max_sdu[tc] = q->max_sdu[tc]; fp[tc] = q->fp[tc]; } nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) return err; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { q->max_sdu[tc] = max_sdu[tc]; q->fp[tc] = fp[tc]; if (fp[tc] != TC_FP_EXPRESS) have_preemption = true; } if (have_preemption) { if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG(extack, "Preemption only supported with full offload"); return -EOPNOTSUPP; } if (!ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } } return err; } static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { int i; if (!mqprio || mqprio->num_tc != dev->num_tc) return -1; for (i = 0; i < mqprio->num_tc; i++) if (dev->tc_to_txq[i].count != mqprio->count[i] || dev->tc_to_txq[i].offset != mqprio->offset[i]) return -1; for (i = 0; i <= TC_BITMASK; i++) if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) return -1; return 0; } static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; u32 taprio_flags; ktime_t start; int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); if (err < 0) return err; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); /* The semantics of the 'flags' argument in relation to 'change()' * requests, are interpreted following two rules (which are applied in * this order): (1) an omitted 'flags' argument is interpreted as * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. */ taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0); /* txtime-assist and full offload are mutually exclusive */ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS], "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive"); return -EINVAL; } if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) { NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP; } q->flags = taprio_flags; /* Needed for length_to_duration() during netlink attribute parsing */ taprio_set_picos_per_byte(dev, q); err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; err = taprio_parse_tc_entries(sch, opt, extack); if (err) return err; new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM; } INIT_LIST_HEAD(&new_admin->entries); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) mqprio = NULL; if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; goto free_sched; } if (mqprio) { err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; for (i = 0; i < mqprio->num_tc; i++) { netdev_set_tc_queue(dev, i, mqprio->count[i], mqprio->offset[i]); q->cur_txq[i] = mqprio->offset[i]; } /* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++) netdev_set_prio_tc_map(dev, i, mqprio->prio_tc_map[i]); } err = parse_taprio_schedule(q, tb, new_admin, extack); if (err < 0) goto free_sched; if (new_admin->num_entries == 0) { NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); err = -EINVAL; goto free_sched; } err = taprio_parse_clockid(sch, tb, extack); if (err < 0) goto free_sched; taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = taprio_enable_offload(dev, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); if (err) goto free_sched; /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); err = -EINVAL; goto unlock; } q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS); } err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); goto unlock; } setup_txtime(q, new_admin, start); if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; new_admin = NULL; goto unlock; } /* Not going to race against advance_sched(), but still */ admin = rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); taprio_start_sched(sch, start, new_admin); admin = rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) taprio_offload_config_changed(q); } new_admin = NULL; err = 0; if (!stab) NL_SET_ERR_MSG_MOD(extack, "Size table not specified, frame length estimations may be inaccurate"); unlock: spin_unlock_bh(qdisc_lock(sch)); free_sched: if (new_admin) call_rcu(&new_admin->rcu, taprio_free_sched_cb); return err; } static void taprio_reset(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i; hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) if (q->qdiscs[i]) qdisc_reset(q->qdiscs[i]); } } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; unsigned int i; list_del(&q->taprio_list); /* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); } q->qdiscs = NULL; netdev_reset_tc(dev); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper) call_rcu(&oper->rcu, taprio_free_sched_cb); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i, tc; spin_lock_init(&q->current_entry_lock); hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS); q->root = sch; /* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change(). */ q->clockid = -1; q->flags = TAPRIO_FLAGS_INVALID; list_add(&q->taprio_list, &taprio_list); if (sch->parent != TC_H_ROOT) { NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; } if (!netif_is_multiqueue(dev)) { NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; } q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); if (!q->qdiscs) return -ENOMEM; if (!opt) return -EINVAL; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc; dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; if (i < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); q->qdiscs[i] = qdisc; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->fp[tc] = TC_FP_EXPRESS; taprio_detect_broken_mqprio(q); return taprio_change(sch, opt, extack); } static void taprio_attach(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx]; /* In offload mode, the root taprio qdisc is bypassed * and the netdev TX queues see the children directly */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue_qdisc = qdisc; } else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue(). */ dev_queue_qdisc = sch; } old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); /* The qdisc's refcount requires to be elevated once * for each netdev TX queue it is grafted onto */ qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int taprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); /* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. * However, save the reference to the new qdisc in the private array in * both software and offload cases, to have an up-to-date reference to * our children. */ *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); if (new) qdisc_refcount_inc(new); if (*old) qdisc_put(*old); } q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_entry(struct sk_buff *msg, const struct sched_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, entry->gate_mask)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; return nla_nest_end(msg, item); nla_put_failure: nla_nest_cancel(msg, item); return -1; } static int dump_schedule(struct sk_buff *msg, const struct sched_gate_list *root) { struct nlattr *entry_list; struct sched_entry *entry; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, root->base_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, root->cycle_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, root->cycle_time_extension, TCA_TAPRIO_PAD)) return -1; entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) goto error_nest; list_for_each_entry(entry, &root->entries, list) { if (dump_entry(msg, entry) < 0) goto error_nest; } nla_nest_end(msg, entry_list); return 0; error_nest: nla_nest_cancel(msg, entry_list); return -1; } static int taprio_dump_tc_entries(struct sk_buff *skb, struct taprio_sched *q, struct sched_gate_list *sched) { struct nlattr *n; int tc; for (tc = 0; tc < TC_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, sched->max_sdu[tc])) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == TAPRIO_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) return -EMSGSIZE; return 0; } static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, struct tc_taprio_qopt_offload *offload, struct tc_taprio_qopt_stats *stats) { struct net_device *dev = qdisc_dev(sch); const struct net_device_ops *ops; struct sk_buff *skb = d->skb; struct nlattr *xstats; int err; ops = qdisc_dev(sch)->netdev_ops; /* FIXME I could use qdisc_offload_dump_helper(), but that messes * with sch->flags depending on whether the device reports taprio * stats, and I'm not sure whether that's a good idea, considering * that stats are optional to the offload itself */ if (!ops->ndo_setup_tc) return 0; memset(stats, 0xff, sizeof(*stats)); err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err == -EOPNOTSUPP) return 0; if (err) return err; xstats = nla_nest_start(skb, TCA_STATS_APP); if (!xstats) goto err; if (taprio_put_stat(skb, stats->window_drops, TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || taprio_put_stat(skb, stats->tx_overruns, TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) goto err_cancel; nla_nest_end(skb, xstats); return 0; err_cancel: nla_nest_cancel(skb, xstats); err: return -EMSGSIZE; } static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_STATS, }; return taprio_dump_xstats(sch, d, &offload, &offload.stats); } static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) goto options_error; if (q->txtime_delay && nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; rcu_read_lock(); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper && taprio_dump_tc_entries(skb, q, oper)) goto options_error_rcu; if (oper && dump_schedule(skb, oper)) goto options_error_rcu; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) goto options_error_rcu; if (dump_schedule(skb, admin)) goto admin_error; nla_nest_end(skb, sched_nest); done: rcu_read_unlock(); return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); options_error_rcu: rcu_read_unlock(); options_error: nla_nest_cancel(skb, nest); start_error: return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!taprio_queue_get(sch, ntx)) return 0; return ntx; } static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = child->handle; return 0; } static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops taprio_class_ops = { .graft = taprio_graft, .leaf = taprio_leaf, .find = taprio_find, .walk = taprio_walk, .dump = taprio_dump_class, .dump_stats = taprio_dump_class_stats, .select_queue = taprio_select_queue, }; static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .cl_ops = &taprio_class_ops, .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("taprio"); static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, }; static int __init taprio_module_init(void) { int err = register_netdevice_notifier(&taprio_device_notifier); if (err) return err; return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); module_exit(taprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Time Aware Priority qdisc");
7 7 7 7 7 7 7 7 7 1 1 3 3 4 4 1 4 4 4 10 10 1 1 5 3 2 7 2 5 3 1 2 2 1 4 4 1 1 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 // SPDX-License-Identifier: GPL-2.0-only /* * Netlink interface for IEEE 802.15.4 stack * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Maxim Osipov <maxim.osipov@siemens.com> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/if_arp.h> #include <net/netlink.h> #include <net/genetlink.h> #include <net/cfg802154.h> #include <net/af_ieee802154.h> #include <net/ieee802154_netdev.h> #include <net/rtnetlink.h> /* for rtnl_{un,}lock */ #include <linux/nl802154.h> #include "ieee802154.h" #include "rdev-ops.h" #include "core.h" static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct wpan_phy *phy) { void *hdr; int i, pages = 0; u32 *buf = kcalloc(IEEE802154_MAX_PAGE + 1, sizeof(u32), GFP_KERNEL); pr_debug("%s\n", __func__); if (!buf) return -EMSGSIZE; hdr = genlmsg_put(msg, 0, seq, &nl802154_family, flags, IEEE802154_LIST_PHY); if (!hdr) goto out; rtnl_lock(); if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) || nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel)) goto nla_put_failure; for (i = 0; i <= IEEE802154_MAX_PAGE; i++) { if (phy->supported.channels[i]) buf[pages++] = phy->supported.channels[i] | (i << 27); } if (pages && nla_put(msg, IEEE802154_ATTR_CHANNEL_PAGE_LIST, pages * sizeof(uint32_t), buf)) goto nla_put_failure; rtnl_unlock(); kfree(buf); genlmsg_end(msg, hdr); return 0; nla_put_failure: rtnl_unlock(); genlmsg_cancel(msg, hdr); out: kfree(buf); return -EMSGSIZE; } int ieee802154_list_phy(struct sk_buff *skb, struct genl_info *info) { /* Request for interface name, index, type, IEEE address, * PAN Id, short address */ struct sk_buff *msg; struct wpan_phy *phy; const char *name; int rc = -ENOBUFS; pr_debug("%s\n", __func__); if (!info->attrs[IEEE802154_ATTR_PHY_NAME]) return -EINVAL; name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ phy = wpan_phy_find(name); if (!phy) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out_dev; rc = ieee802154_nl_fill_phy(msg, info->snd_portid, info->snd_seq, 0, phy); if (rc < 0) goto out_free; wpan_phy_put(phy); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_dev: wpan_phy_put(phy); return rc; } struct dump_phy_data { struct sk_buff *skb; struct netlink_callback *cb; int idx, s_idx; }; static int ieee802154_dump_phy_iter(struct wpan_phy *phy, void *_data) { int rc; struct dump_phy_data *data = _data; pr_debug("%s\n", __func__); if (data->idx++ < data->s_idx) return 0; rc = ieee802154_nl_fill_phy(data->skb, NETLINK_CB(data->cb->skb).portid, data->cb->nlh->nlmsg_seq, NLM_F_MULTI, phy); if (rc < 0) { data->idx--; return rc; } return 0; } int ieee802154_dump_phy(struct sk_buff *skb, struct netlink_callback *cb) { struct dump_phy_data data = { .cb = cb, .skb = skb, .s_idx = cb->args[0], .idx = 0, }; pr_debug("%s\n", __func__); wpan_phy_for_each(ieee802154_dump_phy_iter, &data); cb->args[0] = data.idx; return skb->len; } int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct wpan_phy *phy; const char *name; const char *devname; int rc = -ENOBUFS; struct net_device *dev; int type = __IEEE802154_DEV_INVALID; unsigned char name_assign_type; pr_debug("%s\n", __func__); if (!info->attrs[IEEE802154_ATTR_PHY_NAME]) return -EINVAL; name = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); if (name[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ if (info->attrs[IEEE802154_ATTR_DEV_NAME]) { devname = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]); if (devname[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') return -EINVAL; /* phy name should be null-terminated */ name_assign_type = NET_NAME_USER; } else { devname = "wpan%d"; name_assign_type = NET_NAME_ENUM; } if (strlen(devname) >= IFNAMSIZ) return -ENAMETOOLONG; phy = wpan_phy_find(name); if (!phy) return -ENODEV; msg = ieee802154_nl_new_reply(info, 0, IEEE802154_ADD_IFACE); if (!msg) goto out_dev; if (info->attrs[IEEE802154_ATTR_HW_ADDR] && nla_len(info->attrs[IEEE802154_ATTR_HW_ADDR]) != IEEE802154_ADDR_LEN) { rc = -EINVAL; goto nla_put_failure; } if (info->attrs[IEEE802154_ATTR_DEV_TYPE]) { type = nla_get_u8(info->attrs[IEEE802154_ATTR_DEV_TYPE]); if (type >= __IEEE802154_DEV_MAX) { rc = -EINVAL; goto nla_put_failure; } } dev = rdev_add_virtual_intf_deprecated(wpan_phy_to_rdev(phy), devname, name_assign_type, type); if (IS_ERR(dev)) { rc = PTR_ERR(dev); goto nla_put_failure; } dev_hold(dev); if (info->attrs[IEEE802154_ATTR_HW_ADDR]) { struct sockaddr addr; addr.sa_family = ARPHRD_IEEE802154; nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR], IEEE802154_ADDR_LEN); /* strangely enough, some callbacks (inetdev_event) from * dev_set_mac_address require RTNL_LOCK */ rtnl_lock(); rc = dev_set_mac_address(dev, &addr, NULL); rtnl_unlock(); if (rc) goto dev_unregister; } if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name)) { rc = -EMSGSIZE; goto nla_put_failure; } dev_put(dev); wpan_phy_put(phy); return ieee802154_nl_reply(msg, info); dev_unregister: rtnl_lock(); /* del_iface must be called with RTNL lock */ rdev_del_virtual_intf_deprecated(wpan_phy_to_rdev(phy), dev); dev_put(dev); rtnl_unlock(); nla_put_failure: nlmsg_free(msg); out_dev: wpan_phy_put(phy); return rc; } int ieee802154_del_iface(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct wpan_phy *phy; const char *name; int rc; struct net_device *dev; pr_debug("%s\n", __func__); if (!info->attrs[IEEE802154_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[IEEE802154_ATTR_DEV_NAME]); if (name[nla_len(info->attrs[IEEE802154_ATTR_DEV_NAME]) - 1] != '\0') return -EINVAL; /* name should be null-terminated */ rc = -ENODEV; dev = dev_get_by_name(genl_info_net(info), name); if (!dev) return rc; if (dev->type != ARPHRD_IEEE802154) goto out; phy = dev->ieee802154_ptr->wpan_phy; BUG_ON(!phy); get_device(&phy->dev); rc = -EINVAL; /* phy name is optional, but should be checked if it's given */ if (info->attrs[IEEE802154_ATTR_PHY_NAME]) { struct wpan_phy *phy2; const char *pname = nla_data(info->attrs[IEEE802154_ATTR_PHY_NAME]); if (pname[nla_len(info->attrs[IEEE802154_ATTR_PHY_NAME]) - 1] != '\0') /* name should be null-terminated */ goto out_dev; phy2 = wpan_phy_find(pname); if (!phy2) goto out_dev; if (phy != phy2) { wpan_phy_put(phy2); goto out_dev; } } rc = -ENOBUFS; msg = ieee802154_nl_new_reply(info, 0, IEEE802154_DEL_IFACE); if (!msg) goto out_dev; rtnl_lock(); rdev_del_virtual_intf_deprecated(wpan_phy_to_rdev(phy), dev); /* We don't have device anymore */ dev_put(dev); dev = NULL; rtnl_unlock(); if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) || nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, name)) goto nla_put_failure; wpan_phy_put(phy); return ieee802154_nl_reply(msg, info); nla_put_failure: nlmsg_free(msg); out_dev: wpan_phy_put(phy); out: dev_put(dev); return rc; }
97 94 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 // SPDX-License-Identifier: GPL-2.0-or-later /* AFS client file system * * Copyright (C) 2002,5 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/sched.h> #include <linux/random.h> #include <linux/proc_fs.h> #define CREATE_TRACE_POINTS #include "internal.h" MODULE_DESCRIPTION("AFS Client File System"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); unsigned afs_debug; module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(debug, "AFS debugging mask"); static char *rootcell; module_param(rootcell, charp, 0); MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list"); struct workqueue_struct *afs_wq; static struct proc_dir_entry *afs_proc_symlink; #if defined(CONFIG_ALPHA) const char afs_init_sysname[] = "alpha_linux26"; #elif defined(CONFIG_X86_64) const char afs_init_sysname[] = "amd64_linux26"; #elif defined(CONFIG_ARM) const char afs_init_sysname[] = "arm_linux26"; #elif defined(CONFIG_ARM64) const char afs_init_sysname[] = "aarch64_linux26"; #elif defined(CONFIG_X86_32) const char afs_init_sysname[] = "i386_linux26"; #elif defined(CONFIG_PPC64) const char afs_init_sysname[] = "ppc64_linux26"; #elif defined(CONFIG_PPC32) const char afs_init_sysname[] = "ppc_linux26"; #elif defined(CONFIG_S390) #ifdef CONFIG_64BIT const char afs_init_sysname[] = "s390x_linux26"; #else const char afs_init_sysname[] = "s390_linux26"; #endif #elif defined(CONFIG_SPARC64) const char afs_init_sysname[] = "sparc64_linux26"; #elif defined(CONFIG_SPARC32) const char afs_init_sysname[] = "sparc_linux26"; #else const char afs_init_sysname[] = "unknown_linux26"; #endif /* * Initialise an AFS network namespace record. */ static int __net_init afs_net_init(struct net *net_ns) { struct afs_sysnames *sysnames; struct afs_net *net = afs_net(net_ns); int ret; net->net = net_ns; net->live = true; generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; idr_init(&net->cells_dyn_ino); init_rwsem(&net->cells_lock); mutex_init(&net->cells_alias_lock); mutex_init(&net->proc_cells_lock); INIT_HLIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); INIT_LIST_HEAD(&net->fs_probe_fast); INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); atomic_set(&net->servers_outstanding, 1); ret = -ENOMEM; sysnames = kzalloc(sizeof(*sysnames), GFP_KERNEL); if (!sysnames) goto error_sysnames; sysnames->subs[0] = (char *)&afs_init_sysname; sysnames->nr = 1; refcount_set(&sysnames->usage, 1); net->sysnames = sysnames; rwlock_init(&net->sysnames_lock); /* Register the /proc stuff */ ret = afs_proc_init(net); if (ret < 0) goto error_proc; /* Initialise the cell DB */ ret = afs_cell_init(net, rootcell); if (ret < 0) goto error_cell_init; /* Create the RxRPC transport */ ret = afs_open_socket(net); if (ret < 0) goto error_open_socket; return 0; error_open_socket: net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); afs_wait_for_servers(net); error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: afs_put_sysnames(net->sysnames); error_sysnames: idr_destroy(&net->cells_dyn_ino); net->live = false; return ret; } /* * Clean up and destroy an AFS network namespace record. */ static void __net_exit afs_net_exit(struct net *net_ns) { struct afs_net *net = afs_net(net_ns); net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); afs_wait_for_servers(net); afs_close_socket(net); afs_proc_cleanup(net); afs_put_sysnames(net->sysnames); idr_destroy(&net->cells_dyn_ino); kfree_rcu(rcu_access_pointer(net->address_prefs), rcu); } static struct pernet_operations afs_net_ops = { .init = afs_net_init, .exit = afs_net_exit, .id = &afs_net_id, .size = sizeof(struct afs_net), }; /* * initialise the AFS client FS module */ static int __init afs_init(void) { int ret = -ENOMEM; printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n"); afs_wq = alloc_workqueue("afs", 0, 0); if (!afs_wq) goto error_afs_wq; afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (!afs_async_calls) goto error_async; afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); if (!afs_lock_manager) goto error_lockmgr; ret = register_pernet_device(&afs_net_ops); if (ret < 0) goto error_net; /* register the filesystems */ ret = afs_fs_init(); if (ret < 0) goto error_fs; afs_proc_symlink = proc_symlink("fs/afs", NULL, "../self/net/afs"); if (!afs_proc_symlink) { ret = -ENOMEM; goto error_proc; } return ret; error_proc: afs_fs_exit(); error_fs: unregister_pernet_device(&afs_net_ops); error_net: destroy_workqueue(afs_lock_manager); error_lockmgr: destroy_workqueue(afs_async_calls); error_async: destroy_workqueue(afs_wq); error_afs_wq: rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); return ret; } /* XXX late_initcall is kludgy, but the only alternative seems to create * a transport upon the first mount, which is worse. Or is it? */ late_initcall(afs_init); /* must be called after net/ to create socket */ /* * clean up on module removal */ static void __exit afs_exit(void) { printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n"); proc_remove(afs_proc_symlink); afs_fs_exit(); unregister_pernet_device(&afs_net_ops); destroy_workqueue(afs_lock_manager); destroy_workqueue(afs_async_calls); destroy_workqueue(afs_wq); afs_clean_up_permit_cache(); rcu_barrier(); } module_exit(afs_exit);
4846 4845 4838 4857 486 4831 4836 4850 4838 4857 4843 4803 4802 4806 4804 4805 51 4797 4803 4804 51 4805 4804 4801 4800 472 4857 485 40 40 465 464 4841 4847 4841 4857 4757 11 4841 25 4831 4857 4741 4857 4844 4857 4841 4857 4840 8 4740 261 4849 4840 3 8 8 3 4835 4743 263 25 4832 4857 4848 4841 4841 383 4847 4843 4844 390 4857 4847 4742 4742 4845 486 4832 4838 4850 4846 4832 4833 4796 4834 4842 81 4802 4806 4804 4817 4843 4849 4847 4856 4857 473 4839 4840 4831 4820 4838 4849 472 4839 473 4829 4844 4857 4842 1591 4690 4846 382 4842 8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Modified to make sys_syslog() more flexible: added commands to * return the last 4k of kernel messages, regardless of whether * they've been read or not. Added option to suppress kernel printk's * to the console. Added hook for sending the console messages * elsewhere, in preparation for a serial line console (someday). * Ted Ts'o, 2/11/93. * Modified for sysctl support, 1/8/97, Chris Horn. * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock * 01Mar01 Andrew Morton */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/console.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/nmi.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/smp.h> #include <linux/security.h> #include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/syscore_ops.h> #include <linux/vmcore_info.h> #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> #include <linux/cpu.h> #include <linux/rculist.h> #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/uaccess.h> #include <asm/sections.h> #include <trace/events/initcall.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" #include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); EXPORT_TRACEPOINT_SYMBOL_GPL(console); /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. */ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* * console_mutex protects console_list updates and console->flags updates. * The flags are synchronized only for consoles that are registered, i.e. * accessible via the console list. */ static DEFINE_MUTEX(console_mutex); /* * console_sem protects updates to console->seq * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem, 1); HLIST_HEAD(console_list); EXPORT_SYMBOL_GPL(console_list); DEFINE_STATIC_SRCU(console_srcu); /* * System may need to suppress printk message under certain * circumstances, like after kernel panic happens. */ int __read_mostly suppress_printk; #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; void lockdep_assert_console_list_lock_held(void) { lockdep_assert_held(&console_mutex); } EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC bool console_srcu_read_lock_is_held(void) { return srcu_read_lock_held(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock_is_held); #endif enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, __DEVKMSG_LOG_BIT_LOCK, }; enum devkmsg_log_masks { DEVKMSG_LOG_MASK_ON = BIT(__DEVKMSG_LOG_BIT_ON), DEVKMSG_LOG_MASK_OFF = BIT(__DEVKMSG_LOG_BIT_OFF), DEVKMSG_LOG_MASK_LOCK = BIT(__DEVKMSG_LOG_BIT_LOCK), }; /* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */ #define DEVKMSG_LOG_MASK_DEFAULT 0 static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; static int __control_devkmsg(char *str) { size_t len; if (!str) return -EINVAL; len = str_has_prefix(str, "on"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_ON; return len; } len = str_has_prefix(str, "off"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_OFF; return len; } len = str_has_prefix(str, "ratelimit"); if (len) { devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT; return len; } return -EINVAL; } static int __init control_devkmsg(char *str) { if (__control_devkmsg(str) < 0) { pr_warn("printk.devkmsg: bad option string '%s'\n", str); return 1; } /* * Set sysctl string accordingly: */ if (devkmsg_log == DEVKMSG_LOG_MASK_ON) strscpy(devkmsg_log_str, "on"); else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) strscpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* * Sysctl cannot change it anymore. The kernel command line setting of * this parameter is to force the setting to be permanent throughout the * runtime of the system. This is a precation measure against userspace * trying to be a smarta** and attempting to change it up on us. */ devkmsg_log |= DEVKMSG_LOG_MASK_LOCK; return 1; } __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; int err; if (write) { if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK) return -EINVAL; old = devkmsg_log; strscpy(old_str, devkmsg_log_str); } err = proc_dostring(table, write, buffer, lenp, ppos); if (err) return err; if (write) { err = __control_devkmsg(devkmsg_log_str); /* * Do not accept an unknown string OR a known string with * trailing crap... */ if (err < 0 || (err + 1 != *lenp)) { /* ... and restore old setting. */ devkmsg_log = old; strscpy(devkmsg_log_str, old_str); return -EINVAL; } } return 0; } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /** * console_list_lock - Lock the console list * * For console list or console->flags updates */ void console_list_lock(void) { /* * In unregister_console() and console_force_preferred_locked(), * synchronize_srcu() is called with the console_list_lock held. * Therefore it is not allowed that the console_list_lock is taken * with the srcu_lock held. * * Detecting if this context is really in the read-side critical * section is only possible if the appropriate debug options are * enabled. */ WARN_ON_ONCE(debug_lockdep_rcu_enabled() && srcu_read_lock_held(&console_srcu)); mutex_lock(&console_mutex); } EXPORT_SYMBOL(console_list_lock); /** * console_list_unlock - Unlock the console list * * Counterpart to console_list_lock() */ void console_list_unlock(void) { mutex_unlock(&console_mutex); } EXPORT_SYMBOL(console_list_unlock); /** * console_srcu_read_lock - Register a new reader for the * SRCU-protected console list * * Use for_each_console_srcu() to iterate the console list * * Context: Any context. * Return: A cookie to pass to console_srcu_read_unlock(). */ int console_srcu_read_lock(void) __acquires(&console_srcu) { return srcu_read_lock_nmisafe(&console_srcu); } EXPORT_SYMBOL(console_srcu_read_lock); /** * console_srcu_read_unlock - Unregister an old reader from * the SRCU-protected console list * @cookie: cookie returned from console_srcu_read_lock() * * Counterpart to console_srcu_read_lock() */ void console_srcu_read_unlock(int cookie) __releases(&console_srcu) { srcu_read_unlock_nmisafe(&console_srcu, cookie); } EXPORT_SYMBOL(console_srcu_read_unlock); /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. */ #define down_console_sem() do { \ down(&console_sem);\ mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ } while (0) static int __down_trylock_console_sem(unsigned long ip) { int lock_failed; unsigned long flags; /* * Here and in __up_console_sem() we need to be in safe mode, * because spindump/WARN/etc from under console ->lock will * deadlock in printk()->down_trylock_console_sem() otherwise. */ printk_safe_enter_irqsave(flags); lock_failed = down_trylock(&console_sem); printk_safe_exit_irqrestore(flags); if (lock_failed) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; } #define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) static void __up_console_sem(unsigned long ip) { unsigned long flags; mutex_release(&console_lock_dep_map, ip); printk_safe_enter_irqsave(flags); up(&console_sem); printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) static bool panic_in_progress(void) { return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); } /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { /* * We can use raw_smp_processor_id() here because it is impossible for * the task to be migrated to the panic_cpu, or away from it. If * panic_cpu has already been set, and we're not currently executing on * that CPU, then we never will be. */ return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); } /* * Return true if a panic is in progress on a remote CPU. * * On true, the local CPU should immediately release any printing resources * that may be needed by the panic CPU. */ bool other_cpu_in_panic(void) { return (panic_in_progress() && !this_cpu_in_panic()); } /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ * hold it and are racing, but it helps tracking those weird code * paths in the console code where we end up in places I want * locked without the console semaphore held). */ static int console_locked; /* * Array of consoles built from command line options (console=) */ #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; enum con_msg_format_flags { MSG_FORMAT_DEFAULT = 0, MSG_FORMAT_SYSLOG = (1 << 0), }; static int console_msg_format = MSG_FORMAT_DEFAULT; /* * The printk log buffer consists of a sequenced collection of records, each * containing variable length message text. Every record also contains its * own meta-data (@info). * * Every record meta-data carries the timestamp in microseconds, as well as * the standard userspace syslog level and syslog facility. The usual kernel * messages use LOG_KERN; userspace-injected messages always carry a matching * syslog facility, by default LOG_USER. The origin of every message can be * reliably determined that way. * * The human readable log message of a record is available in @text, the * length of the message text in @text_len. The stored message is not * terminated. * * Optionally, a record can carry a dictionary of properties (key/value * pairs), to provide userspace with a machine-readable message context. * * Examples for well-defined, commonly used property names are: * DEVICE=b12:8 device identifier * b12:8 block dev_t * c127:3 char dev_t * n8 netdev ifindex * +sound:card0 subsystem:devname * SUBSYSTEM=pci driver-core subsystem name * * Valid characters in property names are [a-zA-Z0-9.-_]. Property names * and values are terminated by a '\0' character. * * Example of record values: * record.text_buf = "it's a line" (unterminated) * record.info.seq = 56 * record.info.ts_nsec = 36863 * record.info.text_len = 11 * record.info.facility = 0 (LOG_KERN) * record.info.flags = 0 * record.info.level = 3 (LOG_ERR) * record.info.caller_id = 299 (task 299) * record.info.dev_info.subsystem = "pci" (terminated) * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated) * * The 'struct printk_info' buffer must never be directly exported to * userspace, it is a kernel-private implementation detail that might * need to be changed in the future, when the requirements change. * * /dev/kmsg exports the structured data in the following line format: * "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n" * * Users of the export format should ignore possible additional values * separated by ',', and find the message after the ';' character. * * The optional key/value pairs are attached as continuation lines starting * with a space character and terminated by a newline. All possible * non-prinatable characters are escaped in the "\xff" notation. */ /* syslog_lock protects syslog_* variables and write access to clear_seq. */ static DEFINE_MUTEX(syslog_lock); /* * Specifies if a legacy console is registered. If legacy consoles are * present, it is necessary to perform the console lock/unlock dance * whenever console flushing should occur. */ bool have_legacy_console; /* * Specifies if an nbcon console is registered. If nbcon consoles are present, * synchronous printing of legacy consoles will not occur during panic until * the backtrace has been stored to the ringbuffer. */ bool have_nbcon_console; /* * Specifies if a boot console is registered. If boot consoles are present, * nbcon consoles cannot print simultaneously and must be synchronized by * the console lock. This is because boot consoles and nbcon consoles may * have mapped the same hardware. */ bool have_boot_console; /* See printk_legacy_allow_panic_sync() for details. */ bool legacy_allow_panic_sync; #ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); static DECLARE_WAIT_QUEUE_HEAD(legacy_wait); /* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; /* True when _all_ printer threads are available for printing. */ bool printk_kthreads_running; struct latched_seq { seqcount_latch_t latch; u64 val[2]; }; /* * The next printk record to read after the last 'clear' command. There are * two copies (updated with seqcount_latch) so that reads can locklessly * access a valid value. Writers are synchronized by @syslog_lock. */ static struct latched_seq clear_seq = { .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), .val[0] = 0, .val[1] = 0, }; #define LOG_LEVEL(v) ((v) & 0x07) #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ #define LOG_ALIGN __alignof__(unsigned long) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) #define LOG_BUF_LEN_MAX ((u32)1 << 31) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; /* * Define the average message size. This only affects the number of * descriptors that will be available. Underestimating is better than * overestimating (too many available descriptors is better than not enough). */ #define PRB_AVGBITS 5 /* 32 character average length */ #if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS #error CONFIG_LOG_BUF_SHIFT value too small. #endif _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS, PRB_AVGBITS, &__log_buf[0]); static struct printk_ringbuffer printk_rb_dynamic; struct printk_ringbuffer *prb = &printk_rb_static; /* * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } /* Must be called under syslog_lock. */ static void latched_seq_write(struct latched_seq *ls, u64 val) { write_seqcount_latch_begin(&ls->latch); ls->val[0] = val; write_seqcount_latch(&ls->latch); ls->val[1] = val; write_seqcount_latch_end(&ls->latch); } /* Can be called from any context. */ static u64 latched_seq_read_nolock(struct latched_seq *ls) { unsigned int seq; unsigned int idx; u64 val; do { seq = read_seqcount_latch(&ls->latch); idx = seq & 0x1; val = ls->val[idx]; } while (read_seqcount_latch_retry(&ls->latch, seq)); return val; } /* Return log buffer address */ char *log_buf_addr_get(void) { return log_buf; } /* Return log buffer size */ u32 log_buf_len_get(void) { return log_buf_len; } /* * Define how much of the log buffer we could take at maximum. The value * must be greater than two. Note that only half of the buffer is available * when the index points to the middle. */ #define MAX_LOG_TAKE_PART 4 static const char trunc_msg[] = "<truncated>"; static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) { /* * The message should not take the whole buffer. Otherwise, it might * get removed too soon. */ u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; if (*text_len > max_text_len) *text_len = max_text_len; /* enable the warning message (if there is room) */ *trunc_msg_len = strlen(trunc_msg); if (*text_len >= *trunc_msg_len) *text_len -= *trunc_msg_len; else *trunc_msg_len = 0; } int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { if (dmesg_restrict) return 1; /* * Unless restricted, we allow "read all" and "get buffer size" * for everybody. */ return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; } static int check_syslog_permissions(int type, int source) { /* * If this is from /proc/kmsg and we've already opened it, then we've * already done the capabilities checks at open time. */ if (source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) goto ok; return -EPERM; } ok: return security_syslog(type); } static void append_char(char **pp, char *e, char c) { if (*pp < e) *(*pp)++ = c; } static ssize_t info_print_ext_header(char *buf, size_t size, struct printk_info *info) { u64 ts_usec = info->ts_nsec; char caller[20]; #ifdef CONFIG_PRINTK_CALLER u32 id = info->caller_id; snprintf(caller, sizeof(caller), ",caller=%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); #else caller[0] = '\0'; #endif do_div(ts_usec, 1000); return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", (info->facility << 3) | info->level, info->seq, ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_add_ext_text(char *buf, size_t size, const char *text, size_t text_len, unsigned char endc) { char *p = buf, *e = buf + size; size_t i; /* escape non-printable characters */ for (i = 0; i < text_len; i++) { unsigned char c = text[i]; if (c < ' ' || c >= 127 || c == '\\') p += scnprintf(p, e - p, "\\x%02x", c); else append_char(&p, e, c); } append_char(&p, e, endc); return p - buf; } static ssize_t msg_add_dict_text(char *buf, size_t size, const char *key, const char *val) { size_t val_len = strlen(val); ssize_t len; if (!val_len) return 0; len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '='); len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n'); return len; } static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { ssize_t len; len = msg_add_ext_text(buf, size, text, text_len, '\n'); if (!dev_info) goto out; len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM", dev_info->subsystem); len += msg_add_dict_text(buf + len, size - len, "DEVICE", dev_info->device); out: return len; } /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { atomic64_t seq; struct ratelimit_state rs; struct mutex lock; struct printk_buffers pbufs; }; static __printf(3, 4) __cold int devkmsg_emit(int facility, int level, const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_emit(facility, level, NULL, fmt, args); va_end(args); return r; } static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ struct file *file = iocb->ki_filp; struct devkmsg_user *user = file->private_data; size_t len = iov_iter_count(from); ssize_t ret = len; if (len > PRINTKRB_RECORD_MAX) return -EINVAL; /* Ignore when user logging is disabled. */ if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return len; /* Ratelimit when not explicitly enabled. */ if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON)) { if (!___ratelimit(&user->rs, current->comm)) return ret; } buf = kmalloc(len+1, GFP_KERNEL); if (buf == NULL) return -ENOMEM; buf[len] = '\0'; if (!copy_from_iter_full(buf, len, from)) { kfree(buf); return -EFAULT; } /* * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace * the decimal value represents 32bit, the lower 3 bit are the log * level, the rest are the log facility. * * If no prefix or no userspace facility is specified, we * enforce LOG_USER, to be able to reliably distinguish * kernel-generated messages from userspace-injected ones. */ line = buf; if (line[0] == '<') { char *endp = NULL; unsigned int u; u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { level = LOG_LEVEL(u); if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; line = endp; } } devkmsg_emit(facility, level, "%s", line); kfree(buf); return ret; } static ssize_t devkmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct devkmsg_user *user = file->private_data; char *outbuf = &user->pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &user->pbufs, }; ssize_t ret; ret = mutex_lock_interruptible(&user->lock); if (ret) return ret; if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; goto out; } /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ ret = wait_event_interruptible(log_wait, printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)); /* LMM(devkmsg_read:A) */ if (ret) goto out; } if (pmsg.dropped) { /* our last seen message is gone, return error and reset */ atomic64_set(&user->seq, pmsg.seq); ret = -EPIPE; goto out; } atomic64_set(&user->seq, pmsg.seq + 1); if (pmsg.outbuf_len > count) { ret = -EINVAL; goto out; } if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) { ret = -EFAULT; goto out; } ret = pmsg.outbuf_len; out: mutex_unlock(&user->lock); return ret; } /* * Be careful when modifying this function!!! * * Only few operations are supported because the device works only with the * entire variable length messages (records). Non-standard values are * returned in the other cases and has been this way for quite some time. * User space applications might depend on this behavior. */ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) { struct devkmsg_user *user = file->private_data; loff_t ret = 0; if (offset) return -ESPIPE; switch (whence) { case SEEK_SET: /* the first record */ atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* * The first record after the last SYSLOG_ACTION_CLEAR, * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } return ret; } static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; struct printk_info info; __poll_t ret = 0; poll_wait(file, &log_wait, wait); if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } return ret; } static int devkmsg_open(struct inode *inode, struct file *file) { struct devkmsg_user *user; int err; if (devkmsg_log & DEVKMSG_LOG_MASK_OFF) return -EPERM; /* write-only does not need any file context */ if ((file->f_flags & O_ACCMODE) != O_WRONLY) { err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, SYSLOG_FROM_READER); if (err) return err; } user = kvmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); if (!user) return -ENOMEM; ratelimit_default_init(&user->rs); ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE); mutex_init(&user->lock); atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; } static int devkmsg_release(struct inode *inode, struct file *file) { struct devkmsg_user *user = file->private_data; ratelimit_state_exit(&user->rs); mutex_destroy(&user->lock); kvfree(user); return 0; } const struct file_operations kmsg_fops = { .open = devkmsg_open, .read = devkmsg_read, .write_iter = devkmsg_write, .llseek = devkmsg_llseek, .poll = devkmsg_poll, .release = devkmsg_release, }; #ifdef CONFIG_VMCORE_INFO /* * This appends the listed symbols to /proc/vmcore * * /proc/vmcore is used by various utilities, like crash and makedumpfile to * obtain access to symbols that are otherwise very difficult to locate. These * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ void log_buf_vmcoreinfo_setup(void) { struct dev_printk_info *dev_info = NULL; VMCOREINFO_SYMBOL(prb); VMCOREINFO_SYMBOL(printk_rb_static); VMCOREINFO_SYMBOL(clear_seq); /* * Export struct size and field offsets. User space tools can * parse it and detect any changes to structure down the line. */ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer); VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring); VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring); VMCOREINFO_OFFSET(printk_ringbuffer, fail); VMCOREINFO_STRUCT_SIZE(prb_desc_ring); VMCOREINFO_OFFSET(prb_desc_ring, count_bits); VMCOREINFO_OFFSET(prb_desc_ring, descs); VMCOREINFO_OFFSET(prb_desc_ring, infos); VMCOREINFO_OFFSET(prb_desc_ring, head_id); VMCOREINFO_OFFSET(prb_desc_ring, tail_id); VMCOREINFO_STRUCT_SIZE(prb_desc); VMCOREINFO_OFFSET(prb_desc, state_var); VMCOREINFO_OFFSET(prb_desc, text_blk_lpos); VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos); VMCOREINFO_OFFSET(prb_data_blk_lpos, begin); VMCOREINFO_OFFSET(prb_data_blk_lpos, next); VMCOREINFO_STRUCT_SIZE(printk_info); VMCOREINFO_OFFSET(printk_info, seq); VMCOREINFO_OFFSET(printk_info, ts_nsec); VMCOREINFO_OFFSET(printk_info, text_len); VMCOREINFO_OFFSET(printk_info, caller_id); VMCOREINFO_OFFSET(printk_info, dev_info); VMCOREINFO_STRUCT_SIZE(dev_printk_info); VMCOREINFO_OFFSET(dev_printk_info, subsystem); VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem)); VMCOREINFO_OFFSET(dev_printk_info, device); VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device)); VMCOREINFO_STRUCT_SIZE(prb_data_ring); VMCOREINFO_OFFSET(prb_data_ring, size_bits); VMCOREINFO_OFFSET(prb_data_ring, data); VMCOREINFO_OFFSET(prb_data_ring, head_lpos); VMCOREINFO_OFFSET(prb_data_ring, tail_lpos); VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); VMCOREINFO_STRUCT_SIZE(latched_seq); VMCOREINFO_OFFSET(latched_seq, val); } #endif /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; /* we practice scaling the ring buffer by powers of 2 */ static void __init log_buf_len_update(u64 size) { if (size > (u64)LOG_BUF_LEN_MAX) { size = (u64)LOG_BUF_LEN_MAX; pr_err("log_buf over 2G is not supported.\n"); } if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = (unsigned long)size; } /* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { u64 size; if (!str) return -EINVAL; size = memparse(str, &str); log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); #ifdef CONFIG_SMP #define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static void __init log_buf_add_cpu(void) { unsigned int cpu_extra; /* * archs should set up cpu_possible_bits properly with * set_cpu_possible() after setup_arch() but just in * case lets ensure this is valid. */ if (num_possible_cpus() == 1) return; cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; /* by default this will only continue through for large > 64 CPUs */ if (cpu_extra <= __LOG_BUF_LEN / 2) return; pr_info("log_buf_len individual max cpu contribution: %d bytes\n", __LOG_CPU_MAX_BUF_LEN); pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", cpu_extra); pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); log_buf_len_update(cpu_extra + __LOG_BUF_LEN); } #else /* !CONFIG_SMP */ static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ static void __init set_percpu_data_ready(void) { __printk_percpu_data_ready = true; } static unsigned int __init add_to_rb(struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_reserved_entry e; struct printk_record dest_r; prb_rec_init_wr(&dest_r, r->info->text_len); if (!prb_reserve(&e, rb, &dest_r)) return 0; memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len); dest_r.info->text_len = r->info->text_len; dest_r.info->facility = r->info->facility; dest_r.info->level = r->info->level; dest_r.info->flags = r->info->flags; dest_r.info->ts_nsec = r->info->ts_nsec; dest_r.info->caller_id = r->info->caller_id; memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info)); prb_final_commit(&e); return prb_record_text_space(&e); } static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata; static void print_log_buf_usage_stats(void) { unsigned int descs_count = log_buf_len >> PRB_AVGBITS; size_t meta_data_size; meta_data_size = descs_count * (sizeof(struct prb_desc) + sizeof(struct printk_info)); pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n", log_buf_len, meta_data_size, log_buf_len + meta_data_size); } void __init setup_log_buf(int early) { struct printk_info *new_infos; unsigned int new_descs_count; struct prb_desc *new_descs; struct printk_info info; struct printk_record r; unsigned int text_size; size_t new_descs_size; size_t new_infos_size; unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; /* * Some archs call setup_log_buf() multiple times - first is very * early, e.g. from setup_arch(), and second - when percpu_areas * are initialised. */ if (!early) set_percpu_data_ready(); if (log_buf != __log_buf) return; if (!early && !new_log_buf_len) log_buf_add_cpu(); if (!new_log_buf_len) { /* Show the memory stats only once. */ if (!early) goto out; return; } new_descs_count = new_log_buf_len >> PRB_AVGBITS; if (new_descs_count == 0) { pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len); goto out; } new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu text bytes not available\n", new_log_buf_len); goto out; } new_descs_size = new_descs_count * sizeof(struct prb_desc); new_descs = memblock_alloc(new_descs_size, LOG_ALIGN); if (unlikely(!new_descs)) { pr_err("log_buf_len: %zu desc bytes not available\n", new_descs_size); goto err_free_log_buf; } new_infos_size = new_descs_count * sizeof(struct printk_info); new_infos = memblock_alloc(new_infos_size, LOG_ALIGN); if (unlikely(!new_infos)) { pr_err("log_buf_len: %zu info bytes not available\n", new_infos_size); goto err_free_descs; } prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf)); prb_init(&printk_rb_dynamic, new_log_buf, ilog2(new_log_buf_len), new_descs, ilog2(new_descs_count), new_infos); local_irq_save(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; free = __LOG_BUF_LEN; prb_for_each_record(0, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } prb = &printk_rb_dynamic; local_irq_restore(flags); /* * Copy any remaining messages that might have appeared from * NMI context after copying but before switching to the * dynamic buffer. */ prb_for_each_record(seq, &printk_rb_static, seq, &r) { text_size = add_to_rb(&printk_rb_dynamic, &r); if (text_size > free) free = 0; else free -= text_size; } if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); } print_log_buf_usage_stats(); pr_info("early log buf free: %u(%u%%)\n", free, (free * 100) / __LOG_BUF_LEN); return; err_free_descs: memblock_free(new_descs, new_descs_size); err_free_log_buf: memblock_free(new_log_buf, new_log_buf_len); out: print_log_buf_usage_stats(); } static bool __read_mostly ignore_loglevel; static int __init ignore_loglevel_setup(char *str) { ignore_loglevel = true; pr_info("debug: ignoring loglevel setting.\n"); return 0; } early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting (prints all kernel messages to the console)"); static bool suppress_message_printing(int level) { return (level >= console_loglevel && !ignore_loglevel); } #ifdef CONFIG_BOOT_PRINTK_DELAY static int boot_delay; /* msecs delay after each printk during bootup */ static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; get_option(&str, &boot_delay); if (boot_delay > 10 * 1000) boot_delay = 0; pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " "HZ: %d, loops_per_msec: %llu\n", boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 0; } early_param("boot_delay", boot_delay_setup); static void boot_delay_msec(int level) { unsigned long long k; unsigned long timeout; bool suppress = !is_printk_force_console() && suppress_message_printing(level); if ((boot_delay == 0 || system_state >= SYSTEM_RUNNING) || suppress) return; k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { k--; cpu_relax(); /* * use (volatile) jiffies to prevent * compiler reduction; loop termination via jiffies * is secondary and may or may not happen. */ if (time_after(jiffies, timeout)) break; touch_nmi_watchdog(); } } #else static inline void boot_delay_msec(int level) { } #endif static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_syslog(unsigned int level, char *buf) { return sprintf(buf, "<%u>", level); } static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } #ifdef CONFIG_PRINTK_CALLER static size_t print_caller(u32 id, char *buf) { char caller[12]; snprintf(caller, sizeof(caller), "%c%u", id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); return sprintf(buf, "[%6s]", caller); } #else #define print_caller(id, buf) 0 #endif static size_t info_print_prefix(const struct printk_info *info, bool syslog, bool time, char *buf) { size_t len = 0; if (syslog) len = print_syslog((info->facility << 3) | info->level, buf); if (time) len += print_time(info->ts_nsec, buf + len); len += print_caller(info->caller_id, buf + len); if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { buf[len++] = ' '; buf[len] = '\0'; } return len; } /* * Prepare the record for printing. The text is shifted within the given * buffer to avoid a need for another one. The following operations are * done: * * - Add prefix for each line. * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). * - Add a string terminator. * * Since the produced string is always terminated, the maximum possible * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added * prefixes and the newline. The terminator is not counted. The dropped * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) { size_t text_len = r->info->text_len; size_t buf_size = r->text_buf_size; char *text = r->text_buf; char prefix[PRINTK_PREFIX_MAX]; bool truncated = false; size_t prefix_len; size_t line_len; size_t len = 0; char *next; /* * If the message was truncated because the buffer was not large * enough, treat the available text as if it were the full text. */ if (text_len > buf_size) text_len = buf_size; prefix_len = info_print_prefix(r->info, syslog, time, prefix); /* * @text_len: bytes of unprocessed text * @line_len: bytes of current line _without_ newline * @text: pointer to beginning of current line * @len: number of bytes prepared in r->text_buf */ for (;;) { next = memchr(text, '\n', text_len); if (next) { line_len = next - text; } else { /* Drop truncated line(s). */ if (truncated) break; line_len = text_len; } /* * Truncate the text if there is not enough space to add the * prefix and a trailing newline and a terminator. */ if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ if (len + prefix_len + line_len + 1 + 1 > buf_size) break; text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); /* * Increment the prepared length to include the text and * prefix that were just moved+copied. Also increment for the * newline at the end of this line. If this is the last line, * there is no newline, but it will be added immediately below. */ len += prefix_len + line_len + 1; if (text_len == line_len) { /* * This is the last line. Add the trailing newline * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; } /* * Advance beyond the added prefix and the related line with * its newline. */ text += prefix_len + line_len + 1; /* * The remaining text has only decreased by the line with its * newline. * * Note that @text_len can become zero. It happens when @text * ended with a newline (either due to truncation or the * original string ending with "\n\n"). The loop is correctly * repeated and (if not truncated) an empty line with a prefix * will be prepared. */ text_len -= line_len + 1; } /* * If a buffer was provided, it will be terminated. Space for the * string terminator is guaranteed to be available. The terminator is * not counted in the return value. */ if (buf_size > 0) r->text_buf[len] = 0; return len; } static size_t get_record_print_text_size(struct printk_info *info, unsigned int line_count, bool syslog, bool time) { char prefix[PRINTK_PREFIX_MAX]; size_t prefix_len; prefix_len = info_print_prefix(info, syslog, time, prefix); /* * Each line will be preceded with a prefix. The intermediate * newlines are already within the text, but a final trailing * newline will be added. */ return ((prefix_len * line_count) + info->text_len + 1); } /* * Beginning with @start_seq, find the first record where it and all following * records up to (but not including) @max_seq fit into @size. * * @max_seq is simply an upper bound and does not need to exist. If the caller * does not require an upper bound, -1 can be used for @max_seq. */ static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, bool syslog, bool time) { struct printk_info info; unsigned int line_count; size_t len = 0; u64 seq; /* Determine the size of the records up to @max_seq. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (info.seq >= max_seq) break; len += get_record_print_text_size(&info, line_count, syslog, time); } /* * Adjust the upper bound for the next loop to avoid subtracting * lengths that were never added. */ if (seq < max_seq) max_seq = seq; /* * Move first record forward until length fits into the buffer. Ignore * newest messages that were not counted in the above cycle. Messages * might appear and get lost in the meantime. This is a best effort * that prevents an infinite loop that could occur with a retry. */ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { if (len <= size || info.seq >= max_seq) break; len -= get_record_print_text_size(&info, line_count, syslog, time); } return seq; } /* The caller is responsible for making sure @size is greater than 0. */ static int syslog_print(char __user *buf, int size) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); mutex_lock(&syslog_lock); /* * Wait for the @syslog_seq record to be available. @syslog_seq may * change while waiting. */ do { seq = syslog_seq; mutex_unlock(&syslog_lock); /* * Guarantee this task is visible on the waitqueue before * checking the wake condition. * * The full memory barrier within set_current_state() of * prepare_to_wait_event() pairs with the full memory barrier * within wq_has_sleeper(). * * This pairs with __wake_up_klogd:A. */ len = wait_event_interruptible(log_wait, prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */ mutex_lock(&syslog_lock); if (len) goto out; } while (syslog_seq != seq); /* * Copy records that fit into the buffer. The above cycle makes sure * that the first record is always available. */ do { size_t n; size_t skip; int err; if (!prb_read_valid(prb, syslog_seq, &r)) break; if (r.info->seq != syslog_seq) { /* message is gone, move to next valid one */ syslog_seq = r.info->seq; syslog_partial = 0; } /* * To keep reading/counting partial line consistent, * use printk_time value as of the beginning of a line. */ if (!syslog_partial) syslog_time = printk_time; skip = syslog_partial; n = record_print_text(&r, true, syslog_time); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_seq = r.info->seq + 1; n -= syslog_partial; syslog_partial = 0; } else if (!len){ /* partial read(), remember position */ n = size; syslog_partial += n; } else n = 0; if (!n) break; mutex_unlock(&syslog_lock); err = copy_to_user(buf, text + skip, n); mutex_lock(&syslog_lock); if (err) { if (!len) len = -EFAULT; break; } len += n; size -= n; buf += n; } while (size); out: mutex_unlock(&syslog_lock); kfree(text); return len; } static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; struct printk_record r; char *text; int len = 0; u64 seq; bool time; text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, size, true, time); prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX); prb_for_each_record(seq, prb, seq, &r) { int textlen; textlen = record_print_text(&r, true, time); if (len + textlen > size) { seq--; break; } if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; if (len < 0) break; } if (clear) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, seq); mutex_unlock(&syslog_lock); } kfree(text); return len; } static void syslog_clear(void) { mutex_lock(&syslog_lock); latched_seq_write(&clear_seq, prb_next_seq(prb)); mutex_unlock(&syslog_lock); } int do_syslog(int type, char __user *buf, int len, int source) { struct printk_info info; bool clear = false; static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, source); if (error) return error; switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; case SYSLOG_ACTION_OPEN: /* Open log */ break; case SYSLOG_ACTION_READ: /* Read from log */ if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print(buf, len); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: clear = true; fallthrough; /* Read last kernel messages */ case SYSLOG_ACTION_READ_ALL: if (!buf || len < 0) return -EINVAL; if (!len) return 0; if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_clear(); break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ case SYSLOG_ACTION_CONSOLE_LEVEL: if (len < 1 || len > 8) return -EINVAL; if (len < minimum_console_loglevel) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ saved_console_loglevel = LOGLEVEL_DEFAULT; break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: mutex_lock(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ mutex_unlock(&syslog_lock); return 0; } if (info.seq != syslog_seq) { /* messages are gone, move to first one */ syslog_seq = info.seq; syslog_partial = 0; } if (source == SYSLOG_FROM_PROC) { /* * Short-cut for poll(/"proc/kmsg") which simply checks * for pending data, not the size; return the count of * records, not the length. */ error = prb_next_seq(prb) - syslog_seq; } else { bool time = syslog_partial ? syslog_time : printk_time; unsigned int line_count; u64 seq; prb_for_each_info(syslog_seq, prb, seq, &info, &line_count) { error += get_record_print_text_size(&info, line_count, true, time); time = printk_time; } error -= syslog_partial; } mutex_unlock(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: error = -EINVAL; break; } return error; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* * Special console_lock variants that help to reduce the risk of soft-lockups. * They allow to pass console_lock to another printk() call using a busy wait. */ #ifdef CONFIG_LOCKDEP static struct lockdep_map console_owner_dep_map = { .name = "console_owner" }; #endif static DEFINE_RAW_SPINLOCK(console_owner_lock); static struct task_struct *console_owner; static bool console_waiter; /** * console_lock_spinning_enable - mark beginning of code where another * thread might safely busy wait * * This basically converts console_lock into a spinlock. This marks * the section where the console_lock owner can not sleep, because * there may be a waiter spinning (like a spinlock). Also it must be * ready to hand over the lock at the end of the section. */ void console_lock_spinning_enable(void) { /* * Do not use spinning in panic(). The panic CPU wants to keep the lock. * Non-panic CPUs abandon the flush anyway. * * Just keep the lockdep annotation. The panic-CPU should avoid * taking console_owner_lock because it might cause a deadlock. * This looks like the easiest way how to prevent false lockdep * reports without handling races a lockless way. */ if (panic_in_progress()) goto lockdep; raw_spin_lock(&console_owner_lock); console_owner = current; raw_spin_unlock(&console_owner_lock); lockdep: /* The waiter may spin on us after setting console_owner */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); } /** * console_lock_spinning_disable_and_check - mark end of code where another * thread was able to busy wait and check if there is a waiter * @cookie: cookie returned from console_srcu_read_lock() * * This is called at the end of the section where spinning is allowed. * It has two functions. First, it is a signal that it is no longer * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * * Important: Callers lose both the console_lock and the SRCU read lock if * there was a busy waiter. They must not touch items synchronized by * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ int console_lock_spinning_disable_and_check(int cookie) { int waiter; /* * Ignore spinning waiters during panic() because they might get stopped * or blocked at any time, * * It is safe because nobody is allowed to start spinning during panic * in the first place. If there has been a waiter then non panic CPUs * might stay spinning. They would get stopped anyway. The panic context * will never start spinning and an interrupted spin on panic CPU will * never continue. */ if (panic_in_progress()) { /* Keep lockdep happy. */ spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } raw_spin_lock(&console_owner_lock); waiter = READ_ONCE(console_waiter); console_owner = NULL; raw_spin_unlock(&console_owner_lock); if (!waiter) { spin_release(&console_owner_dep_map, _THIS_IP_); return 0; } /* The waiter is now free to continue */ WRITE_ONCE(console_waiter, false); spin_release(&console_owner_dep_map, _THIS_IP_); /* * Preserve lockdep lock ordering. Release the SRCU read lock before * releasing the console_lock. */ console_srcu_read_unlock(cookie); /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ mutex_release(&console_lock_dep_map, _THIS_IP_); return 1; } /** * console_trylock_spinning - try to get console_lock by busy waiting * * This allows to busy wait for the console_lock when the current * owner is running in specially marked sections. It means that * the current owner is running and cannot reschedule until it * is ready to lose the lock. * * Return: 1 if we got the lock, 0 othrewise */ static int console_trylock_spinning(void) { struct task_struct *owner = NULL; bool waiter; bool spin = false; unsigned long flags; if (console_trylock()) return 1; /* * It's unsafe to spin once a panic has begun. If we are the * panic CPU, we may have already halted the owner of the * console_sem. If we are not the panic CPU, then we should * avoid taking console_sem, so the panic CPU has a better * chance of cleanly acquiring it later. */ if (panic_in_progress()) return 0; printk_safe_enter_irqsave(flags); raw_spin_lock(&console_owner_lock); owner = READ_ONCE(console_owner); waiter = READ_ONCE(console_waiter); if (!waiter && owner && owner != current) { WRITE_ONCE(console_waiter, true); spin = true; } raw_spin_unlock(&console_owner_lock); /* * If there is an active printk() writing to the * consoles, instead of having it write our data too, * see if we can offload that load from the active * printer, and do some printing ourselves. * Go into a spin only if there isn't already a waiter * spinning, and there is an active printer, and * that active printer isn't us (recursive printk?). */ if (!spin) { printk_safe_exit_irqrestore(flags); return 0; } /* We spin waiting for the owner to release us */ spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); /* Owner will clear console_waiter on hand off */ while (READ_ONCE(console_waiter)) cpu_relax(); spin_release(&console_owner_dep_map, _THIS_IP_); printk_safe_exit_irqrestore(flags); /* * The owner passed the console lock to us. * Since we did not spin on console lock, annotate * this as a trylock. Otherwise lockdep will * complain. */ mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); /* * Update @console_may_schedule for trylock because the previous * owner may have been schedulable. */ console_may_schedule = 0; return 1; } /* * Recursion is tracked separately on each CPU. If NMIs are supported, an * additional NMI context per CPU is also separately tracked. Until per-CPU * is available, a separate "early tracking" is performed. */ static DEFINE_PER_CPU(u8, printk_count); static u8 printk_count_early; #ifdef CONFIG_HAVE_NMI static DEFINE_PER_CPU(u8, printk_count_nmi); static u8 printk_count_nmi_early; #endif /* * Recursion is limited to keep the output sane. printk() should not require * more than 1 level of recursion (allowing, for example, printk() to trigger * a WARN), but a higher value is used in case some printk-internal errors * exist, such as the ringbuffer validation checks failing. */ #define PRINTK_MAX_RECURSION 3 /* * Return a pointer to the dedicated counter for the CPU+context of the * caller. */ static u8 *__printk_recursion_counter(void) { #ifdef CONFIG_HAVE_NMI if (in_nmi()) { if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count_nmi); return &printk_count_nmi_early; } #endif if (printk_percpu_data_ready()) return this_cpu_ptr(&printk_count); return &printk_count_early; } /* * Enter recursion tracking. Interrupts are disabled to simplify tracking. * The caller must check the boolean return value to see if the recursion is * allowed. On failure, interrupts are not disabled. * * @recursion_ptr must be a variable of type (u8 *) and is the same variable * that is passed to printk_exit_irqrestore(). */ #define printk_enter_irqsave(recursion_ptr, flags) \ ({ \ bool success = true; \ \ typecheck(u8 *, recursion_ptr); \ local_irq_save(flags); \ (recursion_ptr) = __printk_recursion_counter(); \ if (*(recursion_ptr) > PRINTK_MAX_RECURSION) { \ local_irq_restore(flags); \ success = false; \ } else { \ (*(recursion_ptr))++; \ } \ success; \ }) /* Exit recursion tracking, restoring interrupts. */ #define printk_exit_irqrestore(recursion_ptr, flags) \ do { \ typecheck(u8 *, recursion_ptr); \ (*(recursion_ptr))--; \ local_irq_restore(flags); \ } while (0) int printk_delay_msec __read_mostly; static inline void printk_delay(int level) { boot_delay_msec(level); if (unlikely(printk_delay_msec)) { int m = printk_delay_msec; while (m--) { mdelay(1); touch_nmi_watchdog(); } } } static inline u32 printk_caller_id(void) { return in_task() ? task_pid_nr(current) : 0x80000000 + smp_processor_id(); } /** * printk_parse_prefix - Parse level and control flags. * * @text: The terminated text message. * @level: A pointer to the current level value, will be updated. * @flags: A pointer to the current printk_info flags, will be updated. * * @level may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @level must be set to * LOGLEVEL_DEFAULT in order to be updated with the parsed value. * * @flags may be NULL if the caller is not interested in the parsed value. * Otherwise the variable pointed to by @flags will be OR'd with the parsed * value. * * Return: The length of the parsed level and control flags. */ u16 printk_parse_prefix(const char *text, int *level, enum printk_info_flags *flags) { u16 prefix_len = 0; int kern_level; while (*text) { kern_level = printk_get_level(text); if (!kern_level) break; switch (kern_level) { case '0' ... '7': if (level && *level == LOGLEVEL_DEFAULT) *level = kern_level - '0'; break; case 'c': /* KERN_CONT */ if (flags) *flags |= LOG_CONT; } prefix_len += 2; text += 2; } return prefix_len; } __printf(5, 0) static u16 printk_sprint(char *text, u16 size, int facility, enum printk_info_flags *flags, const char *fmt, va_list args) { u16 text_len; text_len = vscnprintf(text, size, fmt, args); /* Mark and strip a trailing newline. */ if (text_len && text[text_len - 1] == '\n') { text_len--; *flags |= LOG_NEWLINE; } /* Strip log level and control flags. */ if (facility == 0) { u16 prefix_len; prefix_len = printk_parse_prefix(text, NULL, NULL); if (prefix_len) { text_len -= prefix_len; memmove(text, text + prefix_len, text_len); } } trace_console(text, text_len); return text_len; } __printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct prb_reserved_entry e; enum printk_info_flags flags = 0; struct printk_record r; unsigned long irqflags; u16 trunc_msg_len = 0; char prefix_buf[8]; u8 *recursion_ptr; u16 reserve_size; va_list args2; u32 caller_id; u16 text_len; int ret = 0; u64 ts_nsec; if (!printk_enter_irqsave(recursion_ptr, irqflags)) return 0; /* * Since the duration of printk() can vary depending on the message * and state of the ringbuffer, grab the timestamp now so that it is * close to the call of printk(). This provides a more deterministic * timestamp with respect to the caller. */ ts_nsec = local_clock(); caller_id = printk_caller_id(); /* * The sprintf needs to come first since the syslog prefix might be * passed in as a parameter. An extra byte must be reserved so that * later the vscnprintf() into the reserved buffer has room for the * terminating '\0', which is not counted by vsnprintf(). */ va_copy(args2, args); reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; va_end(args2); if (reserve_size > PRINTKRB_RECORD_MAX) reserve_size = PRINTKRB_RECORD_MAX; /* Extract log level or control flags. */ if (facility == 0) printk_parse_prefix(&prefix_buf[0], &level, &flags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dev_info) flags |= LOG_NEWLINE; if (is_printk_force_console()) flags |= LOG_FORCE_CON; if (flags & LOG_CONT) { prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) { text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, facility, &flags, fmt, args); r.info->text_len += text_len; if (flags & LOG_FORCE_CON) r.info->flags |= LOG_FORCE_CON; if (flags & LOG_NEWLINE) { r.info->flags |= LOG_NEWLINE; prb_final_commit(&e); } else { prb_commit(&e); } ret = text_len; goto out; } } /* * Explicitly initialize the record before every prb_reserve() call. * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. */ prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ truncate_msg(&reserve_size, &trunc_msg_len); prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) goto out; } /* fill message */ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; r.info->facility = facility; r.info->level = level & 7; r.info->flags = flags & 0x1f; r.info->ts_nsec = ts_nsec; r.info->caller_id = caller_id; if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); /* A message without a trailing newline can be continued. */ if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); ret = text_len + trunc_msg_len; out: printk_exit_irqrestore(recursion_ptr, irqflags); return ret; } /* * This acts as a one-way switch to allow legacy consoles to print from * the printk() caller context on a panic CPU. It also attempts to flush * the legacy consoles in this context. */ void printk_legacy_allow_panic_sync(void) { struct console_flush_type ft; legacy_allow_panic_sync = true; printk_get_console_flush_type(&ft); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } bool __read_mostly debug_non_panic_cpus; #ifdef CONFIG_PRINTK_CALLER static int __init debug_non_panic_cpus_setup(char *str) { debug_non_panic_cpus = true; pr_info("allow messages from non-panic CPUs in panic()\n"); return 0; } early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup); module_param(debug_non_panic_cpus, bool, 0644); MODULE_PARM_DESC(debug_non_panic_cpus, "allow messages from non-panic CPUs in panic()"); #endif asmlinkage int vprintk_emit(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { struct console_flush_type ft; int printed_len; /* Suppress unimportant messages after panic happens */ if (unlikely(suppress_printk)) return 0; /* * The messages on the panic CPU are the most important. If * non-panic CPUs are generating any messages, they will be * silently dropped. */ if (other_cpu_in_panic() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; printk_get_console_flush_type(&ft); /* If called from the scheduler, we can not call up(). */ if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; ft.legacy_offload |= ft.legacy_direct; ft.legacy_direct = false; } printk_delay(level); printed_len = vprintk_store(facility, level, dev_info, fmt, args); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_direct) { /* * The caller may be holding system-critical or * timing-sensitive locks. Disable preemption during * printing of all remaining records to all consoles so that * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. */ preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the * spinning variant, this context tries to take over the * printing from another printing context. */ if (console_trylock_spinning()) console_unlock(); preempt_enable(); } if (ft.legacy_offload) defer_console_output(); else wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); int vprintk_default(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } EXPORT_SYMBOL_GPL(vprintk_default); asmlinkage __visible int _printk(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk(fmt, args); va_end(args); return r; } EXPORT_SYMBOL(_printk); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ #define printk_time false #define prb_read_valid(rb, seq, r) false #define prb_first_valid_seq(rb) 0 #define prb_next_seq(rb) 0 static u64 syslog_seq; static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ #ifdef CONFIG_EARLY_PRINTK struct console *early_console; asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; char buf[512]; int n; if (!early_console) return; va_start(ap, fmt); n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); early_console->write(early_console, buf, n); } #endif static void set_user_specified(struct console_cmdline *c, bool user_specified) { if (!user_specified) return; /* * @c console was defined by the user on the command line. * Do not clear when added twice also by SPCR or the device tree. */ c->user_specified = true; /* At least one console defined by the user on the command line. */ console_set_on_cmdline = 1; } static int __add_preferred_console(const char *name, const short idx, const char *devname, char *options, char *brl_options, bool user_specified) { struct console_cmdline *c; int i; if (!name && !devname) return -EINVAL; /* * We use a signed short index for struct console for device drivers to * indicate a not yet assigned index or port. However, a negative index * value is not valid when the console name and index are defined on * the command line. */ if (name && idx < 0) return -EINVAL; /* * See if this tty is not yet registered, and * if we have a slot free. */ for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if ((name && strcmp(c->name, name) == 0 && c->index == idx) || (devname && strcmp(c->devname, devname) == 0)) { if (!brl_options) preferred_console = i; set_user_specified(c, user_specified); return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) preferred_console = i; if (name) strscpy(c->name, name); if (devname) strscpy(c->devname, devname); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); c->index = idx; return 0; } static int __init console_msg_format_setup(char *str) { if (!strcmp(str, "syslog")) console_msg_format = MSG_FORMAT_SYSLOG; if (!strcmp(str, "default")) console_msg_format = MSG_FORMAT_DEFAULT; return 1; } __setup("console_msg_format=", console_msg_format_setup); /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4); char buf[sizeof(console_cmdline[0].devname)]; char *brl_options = NULL; char *ttyname = NULL; char *devname = NULL; char *options; char *s; int idx; /* * console="" or console=null have been suggested as a way to * disable console output. Use ttynull that has been created * for exactly this purpose. */ if (str[0] == 0 || strcmp(str, "null") == 0) { __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true); return 1; } if (_braille_console_setup(&str, &brl_options)) return 1; /* For a DEVNAME:0.0 style console the character device is unknown early */ if (strchr(str, ':')) devname = buf; else ttyname = buf; /* * Decode str into name, index, options. */ if (ttyname && isdigit(str[0])) scnprintf(buf, sizeof(buf), "ttyS%s", str); else strscpy(buf, str); options = strchr(str, ','); if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) strscpy(buf, "ttyS0"); if (!strcmp(str, "ttyb")) strscpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) if ((ttyname && isdigit(*s)) || *s == ',') break; /* @idx will get defined when devname matches. */ if (devname) idx = -1; else idx = simple_strtoul(s, NULL, 10); *s = 0; __add_preferred_console(ttyname, idx, devname, options, brl_options, true); return 1; } __setup("console=", console_setup); /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name * @idx: device index * @options: options for this console * * The last preferred console added will be used for kernel messages * and stdin/out/err for init. Normally this is used by console_setup * above to handle user-supplied console arguments; however it can also * be used by arch-specific code either to override the user or more * commonly to provide a default console (ie from PROM variables) when * the user has not supplied one. */ int add_preferred_console(const char *name, const short idx, char *options) { return __add_preferred_console(name, idx, NULL, options, NULL, false); } /** * match_devname_and_update_preferred_console - Update a preferred console * when matching devname is found. * @devname: DEVNAME:0.0 style device name * @name: Name of the corresponding console driver, e.g. "ttyS" * @idx: Console index, e.g. port number. * * The function checks whether a device with the given @devname is * preferred via the console=DEVNAME:0.0 command line option. * It fills the missing console driver name and console index * so that a later register_console() call could find (match) * and enable this device. * * It might be used when a driver subsystem initializes particular * devices with already known DEVNAME:0.0 style names. And it * could predict which console driver name and index this device * would later get associated with. * * Return: 0 on success, negative error code on failure. */ int match_devname_and_update_preferred_console(const char *devname, const char *name, const short idx) { struct console_cmdline *c = console_cmdline; int i; if (!devname || !strlen(devname) || !name || !strlen(name) || idx < 0) return -EINVAL; for (i = 0; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { if (!strcmp(devname, c->devname)) { pr_info("associate the preferred console \"%s\" with \"%s%d\"\n", devname, name, idx); strscpy(c->name, name); c->index = idx; return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console); bool console_suspend_enabled = true; EXPORT_SYMBOL(console_suspend_enabled); static int __init console_suspend_disable(char *str) { console_suspend_enabled = false; return 1; } __setup("no_console_suspend", console_suspend_disable); module_param_named(console_suspend, console_suspend_enabled, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(console_suspend, "suspend console during suspend" " and hibernate operations"); static bool printk_console_no_auto_verbose; void console_verbose(void) { if (console_loglevel && !printk_console_no_auto_verbose) console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } EXPORT_SYMBOL_GPL(console_verbose); module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644); MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc"); /** * console_suspend_all - suspend the console subsystem * * This disables printk() while we go into suspend states */ void console_suspend_all(void) { struct console *con; if (!console_suspend_enabled) return; pr_info("Suspending console(s) (use no_console_suspend to debug)\n"); pr_flush(1000, true); console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags | CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see that they are suspended so that it * is guaranteed that all printing has stopped when this function * completes. */ synchronize_srcu(&console_srcu); } void console_resume_all(void) { struct console_flush_type ft; struct console *con; if (!console_suspend_enabled) return; console_list_lock(); for_each_console(con) console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All printing * contexts must be able to see they are no longer suspended so * that they are guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); pr_flush(1000, true); } /** * console_cpu_notify - print deferred console messages after CPU hotplug * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages * will be printed on the console only if there are CON_ANYTIME consoles. * This function is called when a new CPU comes online (or fails to come * up) or goes offline. */ static int console_cpu_notify(unsigned int cpu) { struct console_flush_type ft; if (!cpuhp_tasks_frozen) { printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { if (console_trylock()) console_unlock(); } } return 0; } /** * console_lock - block the console subsystem from printing * * Acquires a lock which guarantees that no consoles will * be in or enter their write() callback. * * Can sleep, returns nothing. */ void console_lock(void) { might_sleep(); /* On panic, the console_lock must be left to the panic cpu. */ while (other_cpu_in_panic()) msleep(1000); down_console_sem(); console_locked = 1; console_may_schedule = 1; } EXPORT_SYMBOL(console_lock); /** * console_trylock - try to block the console subsystem from printing * * Try to acquire a lock which guarantees that no consoles will * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ if (other_cpu_in_panic()) return 0; if (down_trylock_console_sem()) return 0; console_locked = 1; console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); int is_console_locked(void) { return console_locked; } EXPORT_SYMBOL(is_console_locked); static void __console_unlock(void) { console_locked = 0; up_console_sem(); } #ifdef CONFIG_PRINTK /* * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting * the existing message over and inserting the scratchbuf message. * * @pmsg is the original printk message. * @fmt is the printf format of the message which will prepend the existing one. * * If there is not enough space in @pmsg->pbufs->outbuf, the existing * message text will be sufficiently truncated. * * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated. */ __printf(2, 3) static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; va_list args; size_t len; va_start(args, fmt); len = vscnprintf(scratchbuf, scratchbuf_sz, fmt, args); va_end(args); /* * Make sure outbuf is sufficiently large before prepending. * Keep at least the prefix when the message must be truncated. * It is a rather theoretical problem when someone tries to * use a minimalist buffer. */ if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz)) return; if (pmsg->outbuf_len + len >= outbuf_sz) { /* Truncate the message, but keep it terminated. */ pmsg->outbuf_len = outbuf_sz - (len + 1); outbuf[pmsg->outbuf_len] = 0; } memmove(outbuf + len, outbuf, pmsg->outbuf_len + 1); memcpy(outbuf, scratchbuf, len); pmsg->outbuf_len += len; } /* * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. * * @dropped is the dropped count to report in the dropped message. */ void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped) { console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped); } /* * Prepend the message in @pmsg->pbufs->outbuf with a "replay message". * @pmsg->outbuf_len is updated appropriately. * * @pmsg is the printk message to prepend. */ void console_prepend_replay(struct printk_message *pmsg) { console_prepend_message(pmsg, "** replaying previous printk message **\n"); } /* * Read and format the specified record (or a later record if the specified * record is not available). * * @pmsg will contain the formatted result. @pmsg->pbufs must point to a * struct printk_buffers. * * @seq is the record to read and format. If it is not available, the next * valid record is read. * * @is_extended specifies if the message should be formatted for extended * console output. * * @may_supress specifies if records may be skipped based on loglevel. * * Returns false if no record is available. Otherwise true and all fields * of @pmsg are valid. (See the documentation of struct printk_message * for information about the @pmsg fields.) */ bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_suppress) { struct printk_buffers *pbufs = pmsg->pbufs; const size_t scratchbuf_sz = sizeof(pbufs->scratchbuf); const size_t outbuf_sz = sizeof(pbufs->outbuf); char *scratchbuf = &pbufs->scratchbuf[0]; char *outbuf = &pbufs->outbuf[0]; struct printk_info info; struct printk_record r; size_t len = 0; bool force_con; /* * Formatting extended messages requires a separate buffer, so use the * scratch buffer to read in the ringbuffer text. * * Formatting normal messages is done in-place, so read the ringbuffer * text directly into the output buffer. */ if (is_extended) prb_rec_init_rd(&r, &info, scratchbuf, scratchbuf_sz); else prb_rec_init_rd(&r, &info, outbuf, outbuf_sz); if (!prb_read_valid(prb, seq, &r)) return false; pmsg->seq = r.info->seq; pmsg->dropped = r.info->seq - seq; force_con = r.info->flags & LOG_FORCE_CON; /* * Skip records that are not forced to be printed on consoles and that * has level above the console loglevel. */ if (!force_con && may_suppress && suppress_message_printing(r.info->level)) goto out; if (is_extended) { len = info_print_ext_header(outbuf, outbuf_sz, r.info); len += msg_print_ext_body(outbuf + len, outbuf_sz - len, &r.text_buf[0], r.info->text_len, &r.info->dev_info); } else { len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); } out: pmsg->outbuf_len = len; return true; } /* * Legacy console printing from printk() caller context does not respect * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a * false positive. For PREEMPT_RT the false positive condition does not * occur. * * This map is used to temporarily establish LD_WAIT_SLEEP context for the * console write() callback when legacy printing to avoid false positive * lockdep complaints, thus allowing lockdep to continue to function for * real issues. */ #ifdef CONFIG_PREEMPT_RT static inline void printk_legacy_allow_spinlock_enter(void) { } static inline void printk_legacy_allow_spinlock_exit(void) { } #else static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP); static inline void printk_legacy_allow_spinlock_enter(void) { lock_map_acquire_try(&printk_legacy_map); } static inline void printk_legacy_allow_spinlock_exit(void) { lock_map_release(&printk_legacy_map); } #endif /* CONFIG_PREEMPT_RT */ /* * Used as the printk buffers for non-panic, serialized console printing. * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles. * Its usage requires the console_lock held. */ struct printk_buffers printk_shared_pbufs; /* * Print one record for the given console. The record printed is whatever * record is the next available record for the given console. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding both the * console_lock and the SRCU read lock. Otherwise it is set to false. * * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED; char *outbuf = &printk_shared_pbufs.outbuf[0]; struct printk_message pmsg = { .pbufs = &printk_shared_pbufs, }; unsigned long flags; *handover = false; if (!printk_get_next_message(&pmsg, con->seq, is_extended, true)) return false; con->dropped += pmsg.dropped; /* Skip messages of formatted length 0. */ if (pmsg.outbuf_len == 0) { con->seq = pmsg.seq + 1; goto skip; } if (con->dropped && !is_extended) { console_prepend_dropped(&pmsg, con->dropped); con->dropped = 0; } /* Write everything out to the hardware. */ if (force_legacy_kthread() && !panic_in_progress()) { /* * With forced threading this function is in a task context * (either legacy kthread or get_init_console_seq()). There * is no need for concern about printk reentrance, handovers, * or lockdep complaints. */ con->write(con, outbuf, pmsg.outbuf_len); con->seq = pmsg.seq + 1; } else { /* * While actively printing out messages, if another printk() * were to occur on another CPU, it may wait for this one to * finish. This task can not be preempted if there is a * waiter waiting to take over. * * Interrupts are disabled because the hand over to a waiter * must not be interrupted until the hand over is completed * (@console_waiter is cleared). */ printk_safe_enter_irqsave(flags); console_lock_spinning_enable(); /* Do not trace print latency. */ stop_critical_timings(); printk_legacy_allow_spinlock_enter(); con->write(con, outbuf, pmsg.outbuf_len); printk_legacy_allow_spinlock_exit(); start_critical_timings(); con->seq = pmsg.seq + 1; *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); } skip: return true; } #else static bool console_emit_next_record(struct console *con, bool *handover, int cookie) { *handover = false; return false; } static inline void printk_kthreads_check_locked(void) { } #endif /* CONFIG_PRINTK */ /* * Print out all remaining records to all consoles. * * @do_cond_resched is set by the caller. It can be true only in schedulable * context. * * @next_seq is set to the sequence number after the last available record. * The value is valid only when this function returns true. It means that all * usable consoles are completely flushed. * * @handover will be set to true if a printk waiter has taken over the * console_lock, in which case the caller is no longer holding the * console_lock. Otherwise it is set to false. * * Returns true when there was at least one usable console and all messages * were flushed to all usable consoles. A returned false informs the caller * that everything was not flushed (either there were no usable consoles or * another context has taken over printing or it is a panic situation and this * is not the panic CPU). Regardless the reason, the caller should assume it * is not useful to immediately try again. * * Requires the console_lock. */ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover) { struct console_flush_type ft; bool any_usable = false; struct console *con; bool any_progress; int cookie; *next_seq = 0; *handover = false; do { any_progress = false; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; bool progress; /* * console_flush_all() is only responsible for nbcon * consoles when the nbcon consoles cannot print via * their atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, !do_cond_resched)) continue; any_usable = true; if (flags & CON_NBCON) { progress = nbcon_legacy_emit_next_record(con, handover, cookie, !do_cond_resched); printk_seq = nbcon_seq_read(con); } else { progress = console_emit_next_record(con, handover, cookie); printk_seq = con->seq; } /* * If a handover has occurred, the SRCU read lock * is already released. */ if (*handover) return false; /* Track the next of the highest seq flushed. */ if (printk_seq > *next_seq) *next_seq = printk_seq; if (!progress) continue; any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ if (other_cpu_in_panic()) goto abandon; if (do_cond_resched) cond_resched(); } console_srcu_read_unlock(cookie); } while (any_progress); return any_usable; abandon: console_srcu_read_unlock(cookie); return false; } static void __console_flush_and_unlock(void) { bool do_cond_resched; bool handover; bool flushed; u64 next_seq; /* * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() * between lines if allowable. Not doing so can cause a very long * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. Therefore, create * a local to use for the printing loop. */ do_cond_resched = console_may_schedule; do { console_may_schedule = 0; flushed = console_flush_all(do_cond_resched, &next_seq, &handover); if (!handover) __console_unlock(); /* * Abort if there was a failure to flush all messages to all * usable consoles. Either it is not possible to flush (in * which case it would be an infinite loop of retrying) or * another context has taken over printing. */ if (!flushed) break; /* * Some context may have added new records after * console_flush_all() but before unlocking the console. * Re-check if there is a new record to flush. If the trylock * fails, another context is already handling the printing. */ } while (prb_read_valid(prb, next_seq, NULL) && console_trylock()); } /** * console_unlock - unblock the legacy console subsystem from printing * * Releases the console_lock which the caller holds to block printing of * the legacy console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock() emits the output on * legacy consoles prior to releasing the lock. * * console_unlock(); may be called from any context. */ void console_unlock(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (ft.legacy_direct) __console_flush_and_unlock(); else __console_unlock(); } EXPORT_SYMBOL(console_unlock); /** * console_conditional_schedule - yield the CPU if required * * If the console code is currently allowed to sleep, and * if this CPU should yield the CPU to another task, do * so here. * * Must be called within console_lock();. */ void __sched console_conditional_schedule(void) { if (console_may_schedule) cond_resched(); } EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { bool found_unblank = false; struct console *c; int cookie; /* * First check if there are any consoles implementing the unblank() * callback. If not, there is no reason to continue and take the * console lock, which in particular can be dangerous if * @oops_in_progress is set. */ cookie = console_srcu_read_lock(); for_each_console_srcu(c) { short flags = console_srcu_read_flags(c); if (flags & CON_SUSPENDED) continue; if ((flags & CON_ENABLED) && c->unblank) { found_unblank = true; break; } } console_srcu_read_unlock(cookie); if (!found_unblank) return; /* * Stop console printing because the unblank() callback may * assume the console is not within its write() callback. * * If @oops_in_progress is set, this may be an atomic context. * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { /* Semaphores are not NMI-safe. */ if (in_nmi()) return; /* * Attempting to trylock the console lock can deadlock * if another CPU was stopped while modifying the * semaphore. "Hope and pray" that this is not the * current situation. */ if (down_trylock_console_sem() != 0) return; } else console_lock(); console_locked = 1; console_may_schedule = 0; cookie = console_srcu_read_lock(); for_each_console_srcu(c) { short flags = console_srcu_read_flags(c); if (flags & CON_SUSPENDED) continue; if ((flags & CON_ENABLED) && c->unblank) c->unblank(); } console_srcu_read_unlock(cookie); console_unlock(); if (!oops_in_progress) pr_flush(1000, true); } /* * Rewind all consoles to the oldest available record. * * IMPORTANT: The function is safe only when called under * console_lock(). It is not enforced because * it is used as a best effort in panic(). */ static void __console_rewind_all(void) { struct console *c; short flags; int cookie; u64 seq; seq = prb_first_valid_seq(prb); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { flags = console_srcu_read_flags(c); if (flags & CON_NBCON) { nbcon_seq_force(c, seq); } else { /* * This assignment is safe only when called under * console_lock(). On panic, legacy consoles are * only best effort. */ c->seq = seq; } } console_srcu_read_unlock(cookie); } /** * console_flush_on_panic - flush console content on panic * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ void console_flush_on_panic(enum con_flush_mode mode) { struct console_flush_type ft; bool handover; u64 next_seq; /* * Ignore the console lock and flush out the messages. Attempting a * trylock would not be useful because: * * - if it is contended, it must be ignored anyway * - console_lock() and console_trylock() block and fail * respectively in panic for non-panic CPUs * - semaphores are not NMI-safe */ /* * If another context is holding the console lock, * @console_may_schedule might be set. Clear it so that * this context does not call cond_resched() while flushing. */ console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) __console_rewind_all(); printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); /* Flush legacy consoles once allowed, even when dangerous. */ if (legacy_allow_panic_sync) console_flush_all(false, &next_seq, &handover); } /* * Return the console tty driver structure and its associated index */ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; int cookie; /* * Take console_lock to serialize device() callback with * other console operations. For example, fg_console is * modified under console_lock when switching vt. */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (!c->device) continue; driver = c->device(c, index); if (driver) break; } console_srcu_read_unlock(cookie); console_unlock(); return driver; } /* * Prevent further output on the passed console device so that (for example) * serial drivers can suspend console output before suspending a port, and can * re-enable output afterwards. */ void console_suspend(struct console *console) { __pr_flush(console, 1000, true); console_list_lock(); console_srcu_write_flags(console, console->flags & ~CON_ENABLED); console_list_unlock(); /* * Ensure that all SRCU list walks have completed. All contexts must * be able to see that this console is disabled so that (for example) * the caller can suspend the port without risk of another context * using the port. */ synchronize_srcu(&console_srcu); } EXPORT_SYMBOL(console_suspend); void console_resume(struct console *console) { struct console_flush_type ft; bool is_nbcon; console_list_lock(); console_srcu_write_flags(console, console->flags | CON_ENABLED); is_nbcon = console->flags & CON_NBCON; console_list_unlock(); /* * Ensure that all SRCU list walks have completed. The related * printing context must be able to see it is enabled so that * it is guaranteed to wake up and resume printing. */ synchronize_srcu(&console_srcu); printk_get_console_flush_type(&ft); if (is_nbcon && ft.nbcon_offload) nbcon_kthread_wake(console); else if (ft.legacy_offload) defer_console_output(); __pr_flush(console, 1000, true); } EXPORT_SYMBOL(console_resume); #ifdef CONFIG_PRINTK static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ static bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; static bool legacy_kthread_should_wakeup(void) { struct console_flush_type ft; struct console *con; bool ret = false; int cookie; if (kthread_should_stop()) return true; printk_get_console_flush_type(&ft); cookie = console_srcu_read_lock(); for_each_console_srcu(con) { short flags = console_srcu_read_flags(con); u64 printk_seq; /* * The legacy printer thread is only responsible for nbcon * consoles when the nbcon consoles cannot print via their * atomic or threaded flushing. */ if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload)) continue; if (!console_is_usable(con, flags, false)) continue; if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(con); } else { /* * It is safe to read @seq because only this * thread context updates @seq. */ printk_seq = con->seq; } if (prb_read_valid(prb, printk_seq, NULL)) { ret = true; break; } } console_srcu_read_unlock(cookie); return ret; } static int legacy_kthread_func(void *unused) { for (;;) { wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup()); if (kthread_should_stop()) break; console_lock(); __console_flush_and_unlock(); } return 0; } static bool legacy_kthread_create(void) { struct task_struct *kt; lockdep_assert_console_list_lock_held(); kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy"); if (WARN_ON(IS_ERR(kt))) { pr_err("failed to start legacy printing thread\n"); return false; } printk_legacy_kthread = kt; /* * It is important that console printing threads are scheduled * shortly after a printk call and with generous runtime budgets. */ sched_set_normal(printk_legacy_kthread, -20); return true; } /** * printk_kthreads_shutdown - shutdown all threaded printers * * On system shutdown all threaded printers are stopped. This allows printk * to transition back to atomic printing, thus providing a robust mechanism * for the final shutdown/reboot messages to be output. */ static void printk_kthreads_shutdown(void) { struct console *con; console_list_lock(); if (printk_kthreads_running) { printk_kthreads_running = false; for_each_console(con) { if (con->flags & CON_NBCON) nbcon_kthread_stop(con); } /* * The threads may have been stopped while printing a * backlog. Flush any records left over. */ nbcon_atomic_flush_pending(); } console_list_unlock(); } static struct syscore_ops printk_syscore_ops = { .shutdown = printk_kthreads_shutdown, }; /* * If appropriate, start nbcon kthreads and set @printk_kthreads_running. * If any kthreads fail to start, those consoles are unregistered. * * Must be called under console_list_lock(). */ static void printk_kthreads_check_locked(void) { struct hlist_node *tmp; struct console *con; lockdep_assert_console_list_lock_held(); if (!printk_kthreads_ready) return; if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && !legacy_kthread_create()) { /* * All legacy consoles must be unregistered. If there * are any nbcon consoles, they will set up their own * kthread. */ hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_NBCON) continue; unregister_console_locked(con); } } } else if (printk_legacy_kthread) { kthread_stop(printk_legacy_kthread); printk_legacy_kthread = NULL; } /* * Printer threads cannot be started as long as any boot console is * registered because there is no way to synchronize the hardware * registers between boot console code and regular console code. * It can only be known that there will be no new boot consoles when * an nbcon console is registered. */ if (have_boot_console || !have_nbcon_console) { /* Clear flag in case all nbcon consoles unregistered. */ printk_kthreads_running = false; return; } if (printk_kthreads_running) return; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_NBCON)) continue; if (!nbcon_kthread_create(con)) unregister_console_locked(con); } printk_kthreads_running = true; } static int __init printk_set_kthreads_ready(void) { register_syscore_ops(&printk_syscore_ops); console_list_lock(); printk_kthreads_ready = true; printk_kthreads_check_locked(); console_list_unlock(); return 0; } early_initcall(printk_set_kthreads_ready); #endif /* CONFIG_PRINTK */ static int __read_mostly keep_bootcon; static int __init keep_bootcon_setup(char *str) { keep_bootcon = 1; pr_info("debug: skip boot console de-registration.\n"); return 0; } early_param("keep_bootcon", keep_bootcon_setup); static int console_call_setup(struct console *newcon, char *options) { int err; if (!newcon->setup) return 0; /* Synchronize with possible boot console. */ console_lock(); err = newcon->setup(newcon, options); console_unlock(); return err; } /* * This is called by register_console() to try to match * the newly registered console with any of the ones selected * by either the command line or add_preferred_console() and * setup/enable it. * * Care need to be taken with consoles that are statically * enabled such as netconsole */ static int try_enable_preferred_console(struct console *newcon, bool user_specified) { struct console_cmdline *c; int i, err; for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]); i++, c++) { /* Console not yet initialized? */ if (!c->name[0]) continue; if (c->user_specified != user_specified) continue; if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && newcon->index != c->index) continue; if (newcon->index < 0) newcon->index = c->index; if (_braille_register_console(newcon, c)) return 0; err = console_call_setup(newcon, c->options); if (err) return err; } newcon->flags |= CON_ENABLED; if (i == preferred_console) newcon->flags |= CON_CONSDEV; return 0; } /* * Some consoles, such as pstore and netconsole, can be enabled even * without matching. Accept the pre-enabled consoles only when match() * and setup() had a chance to be called. */ if (newcon->flags & CON_ENABLED && c->user_specified == user_specified) return 0; return -ENOENT; } /* Try to enable the console unconditionally */ static void try_enable_default_console(struct console *newcon) { if (newcon->index < 0) newcon->index = 0; if (console_call_setup(newcon, NULL) != 0) return; newcon->flags |= CON_ENABLED; if (newcon->device) newcon->flags |= CON_CONSDEV; } /* Return the starting sequence number for a newly registered console. */ static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered) { struct console *con; bool handover; u64 init_seq; if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) { /* Get a consistent copy of @syslog_seq. */ mutex_lock(&syslog_lock); init_seq = syslog_seq; mutex_unlock(&syslog_lock); } else { /* Begin with next message added to ringbuffer. */ init_seq = prb_next_seq(prb); /* * If any enabled boot consoles are due to be unregistered * shortly, some may not be caught up and may be the same * device as @newcon. Since it is not known which boot console * is the same device, flush all consoles and, if necessary, * start with the message of the enabled boot console that is * the furthest behind. */ if (bootcon_registered && !keep_bootcon) { /* * Hold the console_lock to stop console printing and * guarantee safe access to console->seq. */ console_lock(); /* * Flush all consoles and set the console to start at * the next unprinted sequence number. */ if (!console_flush_all(true, &init_seq, &handover)) { /* * Flushing failed. Just choose the lowest * sequence of the enabled boot consoles. */ /* * If there was a handover, this context no * longer holds the console_lock. */ if (handover) console_lock(); init_seq = prb_next_seq(prb); for_each_console(con) { u64 seq; if (!(con->flags & CON_BOOT) || !(con->flags & CON_ENABLED)) { continue; } if (con->flags & CON_NBCON) seq = nbcon_seq_read(con); else seq = con->seq; if (seq < init_seq) init_seq = seq; } } console_unlock(); } } return init_seq; } #define console_first() \ hlist_entry(console_list.first, struct console, node) static int unregister_console_locked(struct console *console); /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to * print any messages that were printed by the kernel before the * console driver was initialized. * * This can happen pretty early during the boot process (because of * early_printk) - sometimes before setup_arch() completes - be careful * of what kernel features are used - they may not be initialised yet. * * There are two types of consoles - bootconsoles (early_printk) and * "real" consoles (everything which is not a bootconsole) which are * handled differently. * - Any number of bootconsoles can be registered at any time. * - As soon as a "real" console is registered, all bootconsoles * will be unregistered automatically. * - Once a "real" console is registered, any attempt to register a * bootconsoles will be rejected */ void register_console(struct console *newcon) { bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic; bool bootcon_registered = false; bool realcon_registered = false; struct console *con; unsigned long flags; u64 init_seq; int err; console_list_lock(); for_each_console(con) { if (WARN(con == newcon, "console '%s%d' already registered\n", con->name, con->index)) { goto unlock; } if (con->flags & CON_BOOT) bootcon_registered = true; else realcon_registered = true; } /* Do not register boot consoles when there already is a real one. */ if ((newcon->flags & CON_BOOT) && realcon_registered) { pr_info("Too late to register bootconsole %s%d\n", newcon->name, newcon->index); goto unlock; } if (newcon->flags & CON_NBCON) { /* * Ensure the nbcon console buffers can be allocated * before modifying any global data. */ if (!nbcon_alloc(newcon)) goto unlock; } /* * See if we want to enable this console driver by default. * * Nope when a console is preferred by the command line, device * tree, or SPCR. * * The first real console with tty binding (driver) wins. More * consoles might get enabled before the right one is found. * * Note that a console with tty binding will have CON_CONSDEV * flag set and will be first in the list. */ if (preferred_console < 0) { if (hlist_empty(&console_list) || !console_first()->device || console_first()->flags & CON_BOOT) { try_enable_default_console(newcon); } } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) { if (newcon->flags & CON_NBCON) nbcon_free(newcon); goto unlock; } /* * If we have a bootconsole, and are switching to a real console, * don't print everything out again, since when the boot console, and * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; } newcon->dropped = 0; init_seq = get_init_console_seq(newcon, bootcon_registered); if (newcon->flags & CON_NBCON) { have_nbcon_console = true; nbcon_seq_force(newcon, init_seq); } else { have_legacy_console = true; newcon->seq = init_seq; } if (newcon->flags & CON_BOOT) have_boot_console = true; /* * If another context is actively using the hardware of this new * console, it will not be aware of the nbcon synchronization. This * is a risk that two contexts could access the hardware * simultaneously if this new console is used for atomic printing * and the other context is still using the hardware. * * Use the driver synchronization to ensure that the hardware is not * in use while this new console transitions to being registered. */ if (use_device_lock) newcon->device_lock(newcon, &flags); /* * Put this console in the list - keep the * preferred driver at the head of the list. */ if (hlist_empty(&console_list)) { /* Ensure CON_CONSDEV is always set for the head. */ newcon->flags |= CON_CONSDEV; hlist_add_head_rcu(&newcon->node, &console_list); } else if (newcon->flags & CON_CONSDEV) { /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV); hlist_add_head_rcu(&newcon->node, &console_list); } else { hlist_add_behind_rcu(&newcon->node, console_list.first); } /* * No need to synchronize SRCU here! The caller does not rely * on all contexts being able to see the new console before * register_console() completes. */ /* This new console is now registered. */ if (use_device_lock) newcon->device_unlock(newcon, flags); console_sysfs_notify(); /* * By unregistering the bootconsoles after we enable the real console * we get the "console xxx enabled" message on all the consoles - * boot consoles, real consoles, etc - this is to ensure that end * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ con_printk(KERN_INFO, newcon, "enabled\n"); if (bootcon_registered && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { struct hlist_node *tmp; hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (con->flags & CON_BOOT) unregister_console_locked(con); } } /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); unlock: console_list_unlock(); } EXPORT_SYMBOL(register_console); /* Must be called under console_list_lock(). */ static int unregister_console_locked(struct console *console) { bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic; bool found_legacy_con = false; bool found_nbcon_con = false; bool found_boot_con = false; unsigned long flags; struct console *c; int res; lockdep_assert_console_list_lock_held(); con_printk(KERN_INFO, console, "disabled\n"); res = _braille_unregister_console(console); if (res < 0) return res; if (res > 0) return 0; if (!console_is_registered_locked(console)) res = -ENODEV; else if (console_is_usable(console, console->flags, true)) __pr_flush(console, 1000, true); /* Disable it unconditionally */ console_srcu_write_flags(console, console->flags & ~CON_ENABLED); if (res < 0) return res; /* * Use the driver synchronization to ensure that the hardware is not * in use while this console transitions to being unregistered. */ if (use_device_lock) console->device_lock(console, &flags); hlist_del_init_rcu(&console->node); if (use_device_lock) console->device_unlock(console, flags); /* * <HISTORICAL> * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. * </HISTORICAL> * * The above makes no sense as there is no guarantee that the next * console has any device attached. Oh well.... */ if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV) console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV); /* * Ensure that all SRCU list walks have completed. All contexts * must not be able to see this console in the list so that any * exit/cleanup routines can be performed safely. */ synchronize_srcu(&console_srcu); if (console->flags & CON_NBCON) nbcon_free(console); console_sysfs_notify(); if (console->exit) res = console->exit(console); /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. */ for_each_console(c) { if (c->flags & CON_BOOT) found_boot_con = true; if (c->flags & CON_NBCON) found_nbcon_con = true; else found_legacy_con = true; } if (!found_boot_con) have_boot_console = found_boot_con; if (!found_legacy_con) have_legacy_console = found_legacy_con; if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); return res; } int unregister_console(struct console *console) { int res; console_list_lock(); res = unregister_console_locked(console); console_list_unlock(); return res; } EXPORT_SYMBOL(unregister_console); /** * console_force_preferred_locked - force a registered console preferred * @con: The registered console to force preferred. * * Must be called under console_list_lock(). */ void console_force_preferred_locked(struct console *con) { struct console *cur_pref_con; if (!console_is_registered_locked(con)) return; cur_pref_con = console_first(); /* Already preferred? */ if (cur_pref_con == con) return; /* * Delete, but do not re-initialize the entry. This allows the console * to continue to appear registered (via any hlist_unhashed_lockless() * checks), even though it was briefly removed from the console list. */ hlist_del_rcu(&con->node); /* * Ensure that all SRCU list walks have completed so that the console * can be added to the beginning of the console list and its forward * list pointer can be re-initialized. */ synchronize_srcu(&console_srcu); con->flags |= CON_CONSDEV; WARN_ON(!con->device); /* Only the new head can have CON_CONSDEV set. */ console_srcu_write_flags(cur_pref_con, cur_pref_con->flags & ~CON_CONSDEV); hlist_add_head_rcu(&con->node, &console_list); } EXPORT_SYMBOL(console_force_preferred_locked); /* * Initialize the console device. This is called *early*, so * we can't necessarily depend on lots of kernel help here. * Just do some early initializations, and do the complex setup * later. */ void __init console_init(void) { int ret; initcall_t call; initcall_entry_t *ce; #ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE if (!console_set_on_cmdline) add_preferred_console("ttynull", 0, NULL); #endif /* Setup the default TTY line discipline. */ n_tty_init(); /* * set up the console device so that later boot sequences can * inform about problems etc.. */ ce = __con_initcall_start; trace_initcall_level("console"); while (ce < __con_initcall_end) { call = initcall_from_entry(ce); trace_initcall_start(call); ret = call(); trace_initcall_finish(call, ret); ce++; } } /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code * will access this data, unregister the boot consoles in a late initcall. * * If for some reason, such as deferred probe or the driver being a loadable * module, the real console hasn't registered yet at this point, there will * be a brief interval in which no messages are logged to the console, which * makes it difficult to diagnose problems that occur during this time. * * To mitigate this problem somewhat, only unregister consoles whose memory * intersects with the init section. Note that all other boot consoles will * get unregistered when the real preferred console is registered. */ static int __init printk_late_init(void) { struct hlist_node *tmp; struct console *con; int ret; console_list_lock(); hlist_for_each_entry_safe(con, tmp, &console_list, node) { if (!(con->flags & CON_BOOT)) continue; /* Check addresses that might be used for enabled consoles. */ if (init_section_intersects(con, sizeof(*con)) || init_section_contains(con->write, 0) || init_section_contains(con->read, 0) || init_section_contains(con->device, 0) || init_section_contains(con->unblank, 0) || init_section_contains(con->data, 0)) { /* * Please, consider moving the reported consoles out * of the init section. */ pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n", con->name, con->index); unregister_console_locked(con); } } console_list_unlock(); ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); printk_sysctl_init(); return 0; } late_initcall(printk_late_init); #if defined CONFIG_PRINTK /* If @con is specified, only wait for that console. Otherwise wait for all. */ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms); unsigned long remaining_jiffies = timeout_jiffies; struct console_flush_type ft; struct console *c; u64 last_diff = 0; u64 printk_seq; short flags; int cookie; u64 diff; u64 seq; /* Sorry, pr_flush() will not work this early. */ if (system_state < SYSTEM_SCHEDULING) return false; might_sleep(); seq = prb_next_reserve_seq(prb); /* Flush the consoles so that records up to @seq are printed. */ printk_get_console_flush_type(&ft); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.legacy_direct) { console_lock(); console_unlock(); } for (;;) { unsigned long begin_jiffies; unsigned long slept_jiffies; diff = 0; /* * Hold the console_lock to guarantee safe access to * console->seq. Releasing console_lock flushes more * records in case @seq is still not printed on all * usable consoles. * * Holding the console_lock is not necessary if there * are no legacy or boot consoles. However, such a * console could register at any time. Always hold the * console_lock as a precaution rather than * synchronizing against register_console(). */ console_lock(); cookie = console_srcu_read_lock(); for_each_console_srcu(c) { if (con && con != c) continue; flags = console_srcu_read_flags(c); /* * If consoles are not usable, it cannot be expected * that they make forward progress, so only increment * @diff for usable consoles. */ if (!console_is_usable(c, flags, true) && !console_is_usable(c, flags, false)) { continue; } if (flags & CON_NBCON) { printk_seq = nbcon_seq_read(c); } else { printk_seq = c->seq; } if (printk_seq < seq) diff += seq - printk_seq; } console_srcu_read_unlock(cookie); if (diff != last_diff && reset_on_progress) remaining_jiffies = timeout_jiffies; console_unlock(); /* Note: @diff is 0 if there are no usable consoles. */ if (diff == 0 || remaining_jiffies == 0) break; /* msleep(1) might sleep much longer. Check time by jiffies. */ begin_jiffies = jiffies; msleep(1); slept_jiffies = jiffies - begin_jiffies; remaining_jiffies -= min(slept_jiffies, remaining_jiffies); last_diff = diff; } return (diff == 0); } /** * pr_flush() - Wait for printing threads to catch up. * * @timeout_ms: The maximum time (in ms) to wait. * @reset_on_progress: Reset the timeout if forward progress is seen. * * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 * represents infinite waiting. * * If @reset_on_progress is true, the timeout will be reset whenever any * printer has been seen to make some forward progress. * * Context: Process context. May sleep while acquiring console lock. * Return: true if all usable printers are caught up. */ bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } /* * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 #define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = this_cpu_xchg(printk_pending, 0); if (pending & PRINTK_PENDING_OUTPUT) { if (force_legacy_kthread()) { if (printk_legacy_kthread) wake_up_interruptible(&legacy_wait); } else { if (console_trylock()) console_unlock(); } } if (pending & PRINTK_PENDING_WAKEUP) wake_up_interruptible(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); static void __wake_up_klogd(int val) { if (!printk_percpu_data_ready()) return; preempt_disable(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. * * The full memory barrier within wq_has_sleeper() pairs with the full * memory barrier within set_current_state() of * prepare_to_wait_event(), which is called after ___wait_event() adds * the waiter but before it has checked the wait condition. * * This pairs with devkmsg_read:A and syslog_print:A. */ if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */ (val & PRINTK_PENDING_OUTPUT)) { this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } preempt_enable(); } /** * wake_up_klogd - Wake kernel logging daemon * * Use this function when new records have been added to the ringbuffer * and the console printing of those records has already occurred or is * known to be handled by some other context. This function will only * wake the logging daemon. * * Context: Any context. */ void wake_up_klogd(void) { __wake_up_klogd(PRINTK_PENDING_WAKEUP); } /** * defer_console_output - Wake kernel logging daemon and trigger * console printing in a deferred context * * Use this function when new records have been added to the ringbuffer, * this context is responsible for console printing those records, but * the current context is not allowed to perform the console printing. * Trigger an irq_work context to perform the console printing. This * function also wakes the logging daemon. * * Context: Any context. */ void defer_console_output(void) { /* * New messages may have been added directly to the ringbuffer * using vprintk_store(), so wake any waiters as well. */ __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT); } void printk_trigger_flush(void) { defer_console_output(); } int vprintk_deferred(const char *fmt, va_list args) { return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); } int _printk_deferred(const char *fmt, ...) { va_list args; int r; va_start(args, fmt); r = vprintk_deferred(fmt, args); va_end(args); return r; } /* * printk rate limiting, lifted from the networking subsystem. * * This enforces a rate limit: not more than 10 kernel messages * every 5s to make a denial-of-service attack impossible. */ DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); int __printk_ratelimit(const char *func) { return ___ratelimit(&printk_ratelimit_state, func); } EXPORT_SYMBOL(__printk_ratelimit); /** * printk_timed_ratelimit - caller-controlled printk ratelimiting * @caller_jiffies: pointer to caller's state * @interval_msecs: minimum interval between prints * * printk_timed_ratelimit() returns true if more than @interval_msecs * milliseconds have elapsed since the last time printk_timed_ratelimit() * returned true. */ bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { unsigned long elapsed = jiffies - *caller_jiffies; if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) return false; *caller_jiffies = jiffies; return true; } EXPORT_SYMBOL(printk_timed_ratelimit); static DEFINE_SPINLOCK(dump_list_lock); static LIST_HEAD(dump_list); /** * kmsg_dump_register - register a kernel log dumper. * @dumper: pointer to the kmsg_dumper structure * * Adds a kernel log dumper to the system. The dump callback in the * structure will be called when the kernel oopses or panics and must be * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. */ int kmsg_dump_register(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EBUSY; /* The dump callback needs to be set */ if (!dumper->dump) return -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_register); /** * kmsg_dump_unregister - unregister a kmsg dumper. * @dumper: pointer to the kmsg_dumper structure * * Removes a dump device from the system. Returns zero on success and * %-EINVAL otherwise. */ int kmsg_dump_unregister(struct kmsg_dumper *dumper) { unsigned long flags; int err = -EINVAL; spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); static bool always_kmsg_dump; module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) { switch (reason) { case KMSG_DUMP_PANIC: return "Panic"; case KMSG_DUMP_OOPS: return "Oops"; case KMSG_DUMP_EMERG: return "Emergency"; case KMSG_DUMP_SHUTDOWN: return "Shutdown"; default: return "Unknown"; } } EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); /** * kmsg_dump_desc - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * @desc: a short string to describe what caused the panic or oops. Can be NULL * if no additional description is available. * * Call each of the registered dumper's dump() callback, which can * retrieve the kmsg records with kmsg_dump_get_line() or * kmsg_dump_get_buffer(). */ void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { struct kmsg_dumper *dumper; struct kmsg_dump_detail detail = { .reason = reason, .description = desc}; rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { enum kmsg_dump_reason max_reason = dumper->max_reason; /* * If client has not provided a specific max_reason, default * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set. */ if (max_reason == KMSG_DUMP_UNDEF) { max_reason = always_kmsg_dump ? KMSG_DUMP_MAX : KMSG_DUMP_OOPS; } if (reason > max_reason) continue; /* invoke dumper which will iterate over records */ dumper->dump(dumper, &detail); } rcu_read_unlock(); } /** * kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * * Start at the beginning of the kmsg buffer, with the oldest kmsg * record, and copy one record into the provided buffer. * * Consecutive calls will return the next available record moving * towards the end of the buffer with the youngest messages. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; unsigned int line_count; struct printk_record r; size_t l = 0; bool ret = false; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; prb_rec_init_rd(&r, &info, line, size); /* Read text or count text lines? */ if (line) { if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } l = get_record_print_text_size(&info, line_count, syslog, printk_time); } iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len_out: length of line placed into buffer * * Start at the end of the kmsg buffer and fill the provided buffer * with as many of the *youngest* kmsg records that fit into it. * If the buffer is large enough, all available kmsg records will be * copied with a single call. * * Consecutive calls will fill the buffer with the next block of * available older records, not including the earlier retrieved ones. * * A return value of FALSE indicates that there are no more records to * read. */ bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out) { u64 min_seq = latched_seq_read_nolock(&clear_seq); struct printk_info info; struct printk_record r; u64 seq; u64 next_seq; size_t len = 0; bool ret = false; bool time = printk_time; if (!buf || !size) goto out; if (iter->cur_seq < min_seq) iter->cur_seq = min_seq; if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ iter->cur_seq = info.seq; } } /* last entry */ if (iter->cur_seq >= iter->next_seq) goto out; /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. Pass in size-1 * because this function (by way of record_print_text()) will * not write more than size-1 bytes of text into @buf. */ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, size - 1, syslog, time); /* * Next kmsg_dump_get_buffer() invocation will dump block of * older records stored right before this one. */ next_seq = seq; prb_rec_init_rd(&r, &info, buf, size); prb_for_each_record(seq, prb, seq, &r) { if (r.info->seq >= iter->next_seq) break; len += record_print_text(&r, syslog, time); /* Adjust record to store to remaining buffer space. */ prb_rec_init_rd(&r, &info, buf + len, size - len); } iter->next_seq = next_seq; ret = true; out: if (len_out) *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** * kmsg_dump_rewind - reset the iterator * @iter: kmsg dump iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. */ void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { iter->cur_seq = latched_seq_read_nolock(&clear_seq); iter->next_seq = prb_next_seq(prb); } EXPORT_SYMBOL_GPL(kmsg_dump_rewind); /** * console_try_replay_all - try to replay kernel log on consoles * * Try to obtain lock on console subsystem and replay all * available records in printk buffer on the consoles. * Does nothing if lock is not obtained. * * Context: Any, except for NMI. */ void console_try_replay_all(void) { struct console_flush_type ft; printk_get_console_flush_type(&ft); if (console_trylock()) { __console_rewind_all(); if (ft.nbcon_atomic) nbcon_atomic_flush_pending(); if (ft.nbcon_offload) nbcon_kthreads_wake(); if (ft.legacy_offload) defer_console_output(); /* Consoles are flushed as part of console_unlock(). */ console_unlock(); } } #endif #ifdef CONFIG_SMP static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1); static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0); bool is_printk_cpu_sync_owner(void) { return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id()); } /** * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant * spinning lock is not owned by any CPU. * * Context: Any context. */ void __printk_cpu_sync_wait(void) { do { cpu_relax(); } while (atomic_read(&printk_cpu_sync_owner) != -1); } EXPORT_SYMBOL(__printk_cpu_sync_wait); /** * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant * spinning lock. * * If no processor has the lock, the calling processor takes the lock and * becomes the owner. If the calling processor is already the owner of the * lock, this function succeeds immediately. * * Context: Any context. Expects interrupts to be disabled. * Return: 1 on success, otherwise 0. */ int __printk_cpu_sync_try_get(void) { int cpu; int old; cpu = smp_processor_id(); /* * Guarantee loads and stores from this CPU when it is the lock owner * are _not_ visible to the previous lock owner. This pairs with * __printk_cpu_sync_put:B. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_put:A can never read from * __printk_cpu_sync_try_get:B. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of the previous CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of this CPU */ old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1, cpu); /* LMM(__printk_cpu_sync_try_get:A) */ if (old == -1) { /* * This CPU is now the owner and begins loading/storing * data: LMM(__printk_cpu_sync_try_get:B) */ return 1; } else if (old == cpu) { /* This CPU is already the owner. */ atomic_inc(&printk_cpu_sync_nested); return 1; } return 0; } EXPORT_SYMBOL(__printk_cpu_sync_try_get); /** * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock. * * The calling processor must be the owner of the lock. * * Context: Any context. Expects interrupts to be disabled. */ void __printk_cpu_sync_put(void) { if (atomic_read(&printk_cpu_sync_nested)) { atomic_dec(&printk_cpu_sync_nested); return; } /* * This CPU is finished loading/storing data: * LMM(__printk_cpu_sync_put:A) */ /* * Guarantee loads and stores from this CPU when it was the * lock owner are visible to the next lock owner. This pairs * with __printk_cpu_sync_try_get:A. * * Memory barrier involvement: * * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B, * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A. * * Relies on: * * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B * of this CPU * matching * ACQUIRE from __printk_cpu_sync_try_get:A to * __printk_cpu_sync_try_get:B of the next CPU */ atomic_set_release(&printk_cpu_sync_owner, -1); /* LMM(__printk_cpu_sync_put:B) */ } EXPORT_SYMBOL(__printk_cpu_sync_put); #endif /* CONFIG_SMP */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* * include/uapi/linux/tipc_config.h: Header for TIPC configuration interface * * Copyright (c) 2003-2006, Ericsson AB * Copyright (c) 2005-2007, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _LINUX_TIPC_CONFIG_H_ #define _LINUX_TIPC_CONFIG_H_ #include <linux/types.h> #include <linux/string.h> #include <linux/tipc.h> #include <asm/byteorder.h> /* * Configuration * * All configuration management messaging involves sending a request message * to the TIPC configuration service on a node, which sends a reply message * back. (In the future multi-message replies may be supported.) * * Both request and reply messages consist of a transport header and payload. * The transport header contains info about the desired operation; * the payload consists of zero or more type/length/value (TLV) items * which specify parameters or results for the operation. * * For many operations, the request and reply messages have a fixed number * of TLVs (usually zero or one); however, some reply messages may return * a variable number of TLVs. A failed request is denoted by the presence * of an "error string" TLV in the reply message instead of the TLV(s) the * reply should contain if the request succeeds. */ /* * Public commands: * May be issued by any process. * Accepted by own node, or by remote node only if remote management enabled. */ #define TIPC_CMD_NOOP 0x0000 /* tx none, rx none */ #define TIPC_CMD_GET_NODES 0x0001 /* tx net_addr, rx node_info(s) */ #define TIPC_CMD_GET_MEDIA_NAMES 0x0002 /* tx none, rx media_name(s) */ #define TIPC_CMD_GET_BEARER_NAMES 0x0003 /* tx none, rx bearer_name(s) */ #define TIPC_CMD_GET_LINKS 0x0004 /* tx net_addr, rx link_info(s) */ #define TIPC_CMD_SHOW_NAME_TABLE 0x0005 /* tx name_tbl_query, rx ultra_string */ #define TIPC_CMD_SHOW_PORTS 0x0006 /* tx none, rx ultra_string */ #define TIPC_CMD_SHOW_LINK_STATS 0x000B /* tx link_name, rx ultra_string */ #define TIPC_CMD_SHOW_STATS 0x000F /* tx unsigned, rx ultra_string */ /* * Protected commands: * May only be issued by "network administration capable" process. * Accepted by own node, or by remote node only if remote management enabled * and this node is zone manager. */ #define TIPC_CMD_GET_REMOTE_MNG 0x4003 /* tx none, rx unsigned */ #define TIPC_CMD_GET_MAX_PORTS 0x4004 /* tx none, rx unsigned */ #define TIPC_CMD_GET_MAX_PUBL 0x4005 /* obsoleted */ #define TIPC_CMD_GET_MAX_SUBSCR 0x4006 /* obsoleted */ #define TIPC_CMD_GET_MAX_ZONES 0x4007 /* obsoleted */ #define TIPC_CMD_GET_MAX_CLUSTERS 0x4008 /* obsoleted */ #define TIPC_CMD_GET_MAX_NODES 0x4009 /* obsoleted */ #define TIPC_CMD_GET_MAX_SLAVES 0x400A /* obsoleted */ #define TIPC_CMD_GET_NETID 0x400B /* tx none, rx unsigned */ #define TIPC_CMD_ENABLE_BEARER 0x4101 /* tx bearer_config, rx none */ #define TIPC_CMD_DISABLE_BEARER 0x4102 /* tx bearer_name, rx none */ #define TIPC_CMD_SET_LINK_TOL 0x4107 /* tx link_config, rx none */ #define TIPC_CMD_SET_LINK_PRI 0x4108 /* tx link_config, rx none */ #define TIPC_CMD_SET_LINK_WINDOW 0x4109 /* tx link_config, rx none */ #define TIPC_CMD_SET_LOG_SIZE 0x410A /* obsoleted */ #define TIPC_CMD_DUMP_LOG 0x410B /* obsoleted */ #define TIPC_CMD_RESET_LINK_STATS 0x410C /* tx link_name, rx none */ /* * Private commands: * May only be issued by "network administration capable" process. * Accepted by own node only; cannot be used on a remote node. */ #define TIPC_CMD_SET_NODE_ADDR 0x8001 /* tx net_addr, rx none */ #define TIPC_CMD_SET_REMOTE_MNG 0x8003 /* tx unsigned, rx none */ #define TIPC_CMD_SET_MAX_PORTS 0x8004 /* tx unsigned, rx none */ #define TIPC_CMD_SET_MAX_PUBL 0x8005 /* obsoleted */ #define TIPC_CMD_SET_MAX_SUBSCR 0x8006 /* obsoleted */ #define TIPC_CMD_SET_MAX_ZONES 0x8007 /* obsoleted */ #define TIPC_CMD_SET_MAX_CLUSTERS 0x8008 /* obsoleted */ #define TIPC_CMD_SET_MAX_NODES 0x8009 /* obsoleted */ #define TIPC_CMD_SET_MAX_SLAVES 0x800A /* obsoleted */ #define TIPC_CMD_SET_NETID 0x800B /* tx unsigned, rx none */ /* * Reserved commands: * May not be issued by any process. * Used internally by TIPC. */ #define TIPC_CMD_NOT_NET_ADMIN 0xC001 /* tx none, rx none */ /* * TLV types defined for TIPC */ #define TIPC_TLV_NONE 0 /* no TLV present */ #define TIPC_TLV_VOID 1 /* empty TLV (0 data bytes)*/ #define TIPC_TLV_UNSIGNED 2 /* 32-bit integer */ #define TIPC_TLV_STRING 3 /* char[128] (max) */ #define TIPC_TLV_LARGE_STRING 4 /* char[2048] (max) */ #define TIPC_TLV_ULTRA_STRING 5 /* char[32768] (max) */ #define TIPC_TLV_ERROR_STRING 16 /* char[128] containing "error code" */ #define TIPC_TLV_NET_ADDR 17 /* 32-bit integer denoting <Z.C.N> */ #define TIPC_TLV_MEDIA_NAME 18 /* char[TIPC_MAX_MEDIA_NAME] */ #define TIPC_TLV_BEARER_NAME 19 /* char[TIPC_MAX_BEARER_NAME] */ #define TIPC_TLV_LINK_NAME 20 /* char[TIPC_MAX_LINK_NAME] */ #define TIPC_TLV_NODE_INFO 21 /* struct tipc_node_info */ #define TIPC_TLV_LINK_INFO 22 /* struct tipc_link_info */ #define TIPC_TLV_BEARER_CONFIG 23 /* struct tipc_bearer_config */ #define TIPC_TLV_LINK_CONFIG 24 /* struct tipc_link_config */ #define TIPC_TLV_NAME_TBL_QUERY 25 /* struct tipc_name_table_query */ #define TIPC_TLV_PORT_REF 26 /* 32-bit port reference */ /* * Link priority limits (min, default, max, media default) */ #define TIPC_MIN_LINK_PRI 0 #define TIPC_DEF_LINK_PRI 10 #define TIPC_MAX_LINK_PRI 31 #define TIPC_MEDIA_LINK_PRI (TIPC_MAX_LINK_PRI + 1) /* * Link tolerance limits (min, default, max), in ms */ #define TIPC_MIN_LINK_TOL 50 #define TIPC_DEF_LINK_TOL 1500 #define TIPC_MAX_LINK_TOL 30000 #if (TIPC_MIN_LINK_TOL < 16) #error "TIPC_MIN_LINK_TOL is too small (abort limit may be NaN)" #endif /* * Link window limits (min, default, max), in packets */ #define TIPC_MIN_LINK_WIN 16 #define TIPC_DEF_LINK_WIN 50 #define TIPC_MAX_LINK_WIN 8191 /* * Default MTU for UDP media */ #define TIPC_DEF_LINK_UDP_MTU 14000 struct tipc_node_info { __be32 addr; /* network address of node */ __be32 up; /* 0=down, 1= up */ }; struct tipc_link_info { __be32 dest; /* network address of peer node */ __be32 up; /* 0=down, 1=up */ char str[TIPC_MAX_LINK_NAME]; /* link name */ }; struct tipc_bearer_config { __be32 priority; /* Range [1,31]. Override per link */ __be32 disc_domain; /* <Z.C.N> describing desired nodes */ char name[TIPC_MAX_BEARER_NAME]; }; struct tipc_link_config { __be32 value; char name[TIPC_MAX_LINK_NAME]; }; #define TIPC_NTQ_ALLTYPES 0x80000000 struct tipc_name_table_query { __be32 depth; /* 1:type, 2:+name info, 3:+port info, 4+:+debug info */ __be32 type; /* {t,l,u} info ignored if high bit of "depth" is set */ __be32 lowbound; /* (i.e. displays all entries of name table) */ __be32 upbound; }; /* * The error string TLV is a null-terminated string describing the cause * of the request failure. To simplify error processing (and to save space) * the first character of the string can be a special error code character * (lying by the range 0x80 to 0xFF) which represents a pre-defined reason. */ #define TIPC_CFG_TLV_ERROR "\x80" /* request contains incorrect TLV(s) */ #define TIPC_CFG_NOT_NET_ADMIN "\x81" /* must be network administrator */ #define TIPC_CFG_NOT_ZONE_MSTR "\x82" /* must be zone master */ #define TIPC_CFG_NO_REMOTE "\x83" /* remote management not enabled */ #define TIPC_CFG_NOT_SUPPORTED "\x84" /* request is not supported by TIPC */ #define TIPC_CFG_INVALID_VALUE "\x85" /* request has invalid argument value */ /* * A TLV consists of a descriptor, followed by the TLV value. * TLV descriptor fields are stored in network byte order; * TLV values must also be stored in network byte order (where applicable). * TLV descriptors must be aligned to addresses which are multiple of 4, * so up to 3 bytes of padding may exist at the end of the TLV value area. * There must not be any padding between the TLV descriptor and its value. */ struct tlv_desc { __be16 tlv_len; /* TLV length (descriptor + value) */ __be16 tlv_type; /* TLV identifier */ }; #define TLV_ALIGNTO 4 #define TLV_ALIGN(datalen) (((datalen)+(TLV_ALIGNTO-1)) & ~(TLV_ALIGNTO-1)) #define TLV_LENGTH(datalen) (sizeof(struct tlv_desc) + (datalen)) #define TLV_SPACE(datalen) (TLV_ALIGN(TLV_LENGTH(datalen))) #define TLV_DATA(tlv) ((void *)((char *)(tlv) + TLV_LENGTH(0))) static inline int TLV_OK(const void *tlv, __u16 space) { /* * Would also like to check that "tlv" is a multiple of 4, * but don't know how to do this in a portable way. * - Tried doing (!(tlv & (TLV_ALIGNTO-1))), but GCC compiler * won't allow binary "&" with a pointer. * - Tried casting "tlv" to integer type, but causes warning about size * mismatch when pointer is bigger than chosen type (int, long, ...). */ return (space >= TLV_SPACE(0)) && (__be16_to_cpu(((struct tlv_desc *)tlv)->tlv_len) <= space); } static inline int TLV_CHECK(const void *tlv, __u16 space, __u16 exp_type) { return TLV_OK(tlv, space) && (__be16_to_cpu(((struct tlv_desc *)tlv)->tlv_type) == exp_type); } static inline int TLV_GET_LEN(struct tlv_desc *tlv) { return __be16_to_cpu(tlv->tlv_len); } static inline void TLV_SET_LEN(struct tlv_desc *tlv, __u16 len) { tlv->tlv_len = __cpu_to_be16(len); } static inline int TLV_CHECK_TYPE(struct tlv_desc *tlv, __u16 type) { return (__be16_to_cpu(tlv->tlv_type) == type); } static inline void TLV_SET_TYPE(struct tlv_desc *tlv, __u16 type) { tlv->tlv_type = __cpu_to_be16(type); } static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) { struct tlv_desc *tlv_ptr; int tlv_len; tlv_len = TLV_LENGTH(len); tlv_ptr = (struct tlv_desc *)tlv; tlv_ptr->tlv_type = __cpu_to_be16(type); tlv_ptr->tlv_len = __cpu_to_be16(tlv_len); if (len && data) { memcpy(TLV_DATA(tlv_ptr), data, len); memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); } return TLV_SPACE(len); } /* * A TLV list descriptor simplifies processing of messages * containing multiple TLVs. */ struct tlv_list_desc { struct tlv_desc *tlv_ptr; /* ptr to current TLV */ __u32 tlv_space; /* # bytes from curr TLV to list end */ }; static inline void TLV_LIST_INIT(struct tlv_list_desc *list, void *data, __u32 space) { list->tlv_ptr = (struct tlv_desc *)data; list->tlv_space = space; } static inline int TLV_LIST_EMPTY(struct tlv_list_desc *list) { return (list->tlv_space == 0); } static inline int TLV_LIST_CHECK(struct tlv_list_desc *list, __u16 exp_type) { return TLV_CHECK(list->tlv_ptr, list->tlv_space, exp_type); } static inline void *TLV_LIST_DATA(struct tlv_list_desc *list) { return TLV_DATA(list->tlv_ptr); } static inline void TLV_LIST_STEP(struct tlv_list_desc *list) { __u16 tlv_space = TLV_ALIGN(__be16_to_cpu(list->tlv_ptr->tlv_len)); list->tlv_ptr = (struct tlv_desc *)((char *)list->tlv_ptr + tlv_space); list->tlv_space -= tlv_space; } /* * Configuration messages exchanged via NETLINK_GENERIC use the following * family id, name, version and command. */ #define TIPC_GENL_NAME "TIPC" #define TIPC_GENL_VERSION 0x1 #define TIPC_GENL_CMD 0x1 /* * TIPC specific header used in NETLINK_GENERIC requests. */ struct tipc_genlmsghdr { __u32 dest; /* Destination address */ __u16 cmd; /* Command */ __u16 reserved; /* Unused */ }; #define TIPC_GENL_HDRLEN NLMSG_ALIGN(sizeof(struct tipc_genlmsghdr)) /* * Configuration messages exchanged via TIPC sockets use the TIPC configuration * message header, which is defined below. This structure is analogous * to the Netlink message header, but fields are stored in network byte order * and no padding is permitted between the header and the message data * that follows. */ struct tipc_cfg_msg_hdr { __be32 tcm_len; /* Message length (including header) */ __be16 tcm_type; /* Command type */ __be16 tcm_flags; /* Additional flags */ char tcm_reserved[8]; /* Unused */ }; #define TCM_F_REQUEST 0x1 /* Flag: Request message */ #define TCM_F_MORE 0x2 /* Flag: Message to be continued */ #define TCM_ALIGN(datalen) (((datalen)+3) & ~3) #define TCM_LENGTH(datalen) (sizeof(struct tipc_cfg_msg_hdr) + datalen) #define TCM_SPACE(datalen) (TCM_ALIGN(TCM_LENGTH(datalen))) #define TCM_DATA(tcm_hdr) ((void *)((char *)(tcm_hdr) + TCM_LENGTH(0))) static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags, void *data, __u16 data_len) { struct tipc_cfg_msg_hdr *tcm_hdr; int msg_len; msg_len = TCM_LENGTH(data_len); tcm_hdr = (struct tipc_cfg_msg_hdr *)msg; tcm_hdr->tcm_len = __cpu_to_be32(msg_len); tcm_hdr->tcm_type = __cpu_to_be16(cmd); tcm_hdr->tcm_flags = __cpu_to_be16(flags); if (data_len && data) { memcpy(TCM_DATA(msg), data, data_len); memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); } return TCM_SPACE(data_len); } #endif
2027 11 11 11 11 1 1 10 10 10 12 10 2 2 11 11 11 10 2 10 3 3 3 3 3 12 12 10 2 12 3 44 44 2003 2005 1997 1992 1993 1775 1718 1717 1724 1718 1718 1667 651 649 629 325 3 69 326 326 69 327 357 626 626 628 626 7 7 7 4 3 1916 2 2013 1875 1874 629 607 551 629 141 8 131 1698 1697 1703 1700 1699 1700 1700 1695 1699 1698 2 1697 73 1633 1838 1836 1827 1813 1843 6 4 2 1630 1629 6 6 6 298 1623 1626 128 29 125 126 126 126 128 128 76 69 69 72 3 69 69 1924 1864 1929 1931 1932 1927 147 147 1701 1700 1700 1697 1697 75 1705 629 629 628 627 627 628 629 629 628 629 628 627 586 586 133 134 133 66 77 699 564 626 44 44 44 44 44 8 36 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/hash.h> #include "kernfs-internal.h" DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() * will perform wakeups when releasing console_sem. Holding rename_lock * will introduce deadlock if the scheduler reads the kernfs_name in the * wakeup path. */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) static bool __kernfs_active(struct kernfs_node *kn) { return atomic_read(&kn->active) >= 0; } static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return __kernfs_active(kn); } static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC return kn->flags & KERNFS_LOCKDEP; #else return false; #endif } /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; while (rcu_dereference(to->__parent) && to != from) { depth++; to = rcu_dereference(to->__parent); } return depth; } static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, struct kernfs_node *b) { size_t da, db; struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); if (ra != rb) return NULL; da = kernfs_depth(ra->kn, a); db = kernfs_depth(rb->kn, b); while (da > db) { a = rcu_dereference(a->__parent); da--; } while (db > da) { b = rcu_dereference(b->__parent); db--; } /* worst case b and a will be the same at root */ while (b != a) { b = rcu_dereference(b->__parent); a = rcu_dereference(a->__parent); } return a; } /** * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, * where kn_from is treated as root of the path. * @kn_from: kernfs node which should be treated as root for the path * @kn_to: kernfs node to which path is needed * @buf: buffer to copy the path into * @buflen: size of @buf * * We need to handle couple of scenarios here: * [1] when @kn_from is an ancestor of @kn_to at some level * kn_from: /n1/n2/n3 * kn_to: /n1/n2/n3/n4/n5 * result: /n4/n5 * * [2] when @kn_from is on a different hierarchy and we need to find common * ancestor between @kn_from and @kn_to. * kn_from: /n1/n2/n3/n4 * kn_to: /n1/n2/n5 * result: /../../n5 * OR * kn_from: /n1/n2/n3/n4/n5 [depth=5] * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * * [3] when @kn_to is %NULL result will be "(null)" * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen) { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; ssize_t copied; int i, j; if (!kn_to) return strscpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) return strscpy(buf, "/", buflen); common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); buf[0] = '\0'; for (i = 0; i < depth_from; i++) { copied = strscpy(buf + len, parent_str, buflen - len); if (copied < 0) return copied; len += copied; } /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { const char *name; for (kn = kn_to, j = 0; j < i; j++) kn = rcu_dereference(kn->__parent); name = rcu_dereference(kn->name); len += scnprintf(buf + len, buflen - len, "/%s", name); } return len; } /** * kernfs_name - obtain the name of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is * similar to strscpy(). * * Fills buffer with "(null)" if @kn is %NULL. * * Return: the resulting length of @buf. If @buf isn't long enough, * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { struct kernfs_node *kn_parent; if (!kn) return strscpy(buf, "(null)", buflen); guard(rcu)(); /* * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and * the parent is either existing or not. */ kn_parent = rcu_dereference(kn->__parent); return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen); } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Return: the length of the constructed path. If the path would have been * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { struct kernfs_root *root; guard(rcu)(); if (to) { root = kernfs_root(to); if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { guard(read_lock_irqsave)(&kernfs_rename_lock); return kernfs_path_from_node_locked(to, from, buf, buflen); } } return kernfs_path_from_node_locked(to, from, buf, buflen); } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { if (sz == -E2BIG) pr_cont("(name too long)"); else pr_cont("(error)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. * * Return: parent node of @kn */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; unsigned long flags; read_lock_irqsave(&kernfs_rename_lock, flags); parent = kernfs_parent(kn); kernfs_get(parent); read_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } /** * kernfs_name_hash - calculate hash of @ns + @name * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Return: 31-bit hash of ns + name (so it fits in an off_t) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns < kn->ns) return -1; if (ns > kn->ns) return 1; return strcmp(name, kernfs_rcu_name(kn)); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * Return: * %0 on success, -EEXIST on failure. */ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node *parent = NULL; struct kernfs_node *kn_parent; struct rb_node **node; kn_parent = kernfs_parent(kn); node = &kn_parent->dir.children.rb_node; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn_parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs++; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. * * Return: %true if @kn was actually removed, * %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { struct kernfs_node *kn_parent; if (RB_EMPTY_NODE(&kn->rb)) return false; kn_parent = kernfs_parent(kn); down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) kn_parent->dir.subdirs--; kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); rb_erase(&kn->rb, &kn_parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is %NULL. * * Return: * Pointer to @kn on success, %NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; if (!atomic_inc_unless_negative(&kn->active)) return NULL; if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } /** * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is %NULL. */ void kernfs_put_active(struct kernfs_node *kn) { int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * * Drain existing usages and nuke all existing mmaps of @kn. Multiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) __releases(&kernfs_root(kn)->kernfs_rwsem) __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); /* * Skip draining if already fully drained. This avoids draining and its * lockdep annotations for nodes which have never been activated * allowing embedding kernfs_remove() in create error paths without * worrying about draining. */ if (atomic_read(&kn->active) == KN_DEACTIVATED_BIAS && !kernfs_should_drain_open_files(kn)) return; up_write(&root->kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); } wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, _RET_IP_); } if (kernfs_should_drain_open_files(kn)) kernfs_drain_open_files(kn); down_write(&root->kernfs_rwsem); } /** * kernfs_get - get a reference count on a kernfs_node * @kn: the target kernfs_node */ void kernfs_get(struct kernfs_node *kn) { if (kn) { WARN_ON(!atomic_read(&kn->count)); atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); static void kernfs_free_rcu(struct rcu_head *rcu) { struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); /* If the whole node goes away, then name can't be used outside */ kfree_const(rcu_access_pointer(kn->name)); if (kn->iattr) { simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } kmem_cache_free(kernfs_node_cache, kn); } /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node * * Put a reference count of @kn and destroy it if it reached zero. */ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kernfs_parent(kn); WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", parent ? rcu_dereference(parent->name) : "", rcu_dereference(kn->name), atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); call_rcu(&kn->rcu, kernfs_free_rcu); kn = parent; if (kn) { if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); kfree_rcu(root, rcu); } } EXPORT_SYMBOL_GPL(kernfs_put); /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * * Return: the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry * is accessible, the returned node can be in any state and the caller is * fully responsible for determining what's accessible. */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); if (!name) return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); rcu_assign_pointer(kn->name, name); kn->mode = mode; kn->flags = flags; if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = uid, .ia_gid = gid, }; ret = __kernfs_setattr(kn, &iattr); if (ret < 0) goto err_out3; } if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) goto err_out3; } return kn; err_out3: spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree_const(name); return NULL; } struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; if (parent->mode & S_ISGID) { /* this code block imitates inode_init_owner() for * kernfs */ if (parent->iattr) gid = parent->iattr->ia_gid; if (flags & KERNFS_DIR) mode |= S_ISGID; } kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); rcu_assign_pointer(kn->__parent, parent); } return kn; } /* * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root * @id: the target node id * * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * * Return: %NULL on failure, * otherwise a kernfs node with reference counter incremented. */ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); rcu_read_lock(); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ if (kernfs_ino(kn) != ino) goto err_unlock; } else { /* 0 matches all generations */ if (unlikely(gen && kernfs_gen(kn) != gen)) goto err_unlock; } /* * We should fail if @kn has never been activated and guarantee success * if the caller knows that @kn is active. Both can be achieved by * __kernfs_active() which tests @kn->active without kernfs_rwsem. */ if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; rcu_read_unlock(); return kn; err_unlock: rcu_read_unlock(); return NULL; } /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * * Return: * %0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_root *root = kernfs_root(kn); struct kernfs_iattrs *ps_iattr; struct kernfs_node *parent; bool has_ns; int ret; down_write(&root->kernfs_rwsem); parent = kernfs_parent(kn); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", kernfs_rcu_name(parent), kernfs_rcu_name(kn))) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ down_write(&root->kernfs_iattr_rwsem); ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&root->kernfs_iattr_rwsem); up_write(&root->kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. * If not activated here, the kernfs user is responsible for * activating the node with kernfs_activate(). A node which hasn't * been activated is not visible to userland and its removal won't * trigger deactivation. */ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return 0; out_unlock: up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_find_ns - find kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const void *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { ssize_t len; char *p, *name; lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len < 0) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep. * * Return: pointer to the found kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root = kernfs_root(parent); down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&root->kernfs_rwsem); return kn; } unsigned int kernfs_root_flags(struct kernfs_node *kn) { return kernfs_root(kn)->flags; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Return: the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. * High bits generation. The starting value for both ino and * genenration is 1. Initialize upper 32bit allocation * accordingly. */ if (sizeof(ino_t) >= sizeof(u64)) root->id_highbits = 0; else root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } kn->priv = priv; kn->dir.root = root; root->syscall_ops = scops; root->flags = flags; root->kn = kn; init_waitqueue_head(&root->deactivate_waitq); if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return root; } /** * kernfs_destroy_root - destroy a kernfs hierarchy * @root: root of the hierarchy to destroy * * Destroy the hierarchy anchored at @root by removing all existing * directories and destroying @root. */ void kernfs_destroy_root(struct kernfs_root *root) { /* * kernfs_remove holds kernfs_rwsem from the root so the root * shouldn't be freed during the operation. */ kernfs_get(root->kn); kernfs_remove(root->kn); kernfs_put(root->kn); /* will also free @root */ } /** * kernfs_root_to_node - return the kernfs_node associated with a kernfs_root * @root: root to use to lookup * * Return: @root's kernfs_node */ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root) { return root->kn; } /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @uid: uid of the new directory * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, mode | S_IFDIR, uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->dir.root = parent->dir.root; kn->ns = ns; kn->priv = priv; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } /** * kernfs_create_empty_dir - create an always empty directory * @parent: parent in which to create a new directory * @name: name of the new directory * * Return: the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->flags |= KERNFS_EMPTY_DIR; kn->dir.root = parent->dir.root; kn->ns = NULL; kn->priv = NULL; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn, *parent; struct kernfs_root *root; if (flags & LOOKUP_RCU) return -ECHILD; /* Negative hashed dentry? */ if (d_really_is_negative(dentry)) { /* If the kernfs parent node has changed discard and * proceed to ->lookup. * * There's nothing special needed here when getting the * dentry parent, even if a concurrent rename is in * progress. That's because the dentry is negative so * it can only be the target of the rename and it will * be doing a d_move() not a replace. Consequently the * dentry d_parent won't change over the d_move(). * * Also kernfs negative dentries transitioning from * negative to positive during revalidate won't happen * because they are invalidated on containing directory * changes and the lookup re-done so that a new positive * dentry can be properly created. */ root = kernfs_root_from_sb(dentry->d_sb); down_read(&root->kernfs_rwsem); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { if (kernfs_dir_changed(parent, dentry)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. */ return 1; } kn = kernfs_dentry_node(dentry); root = kernfs_root(kn); down_read(&root->kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) goto out_bad; parent = kernfs_parent(kn); /* The kernfs node has been moved? */ if (kernfs_dentry_node(dentry->d_parent) != parent) goto out_bad; /* The kernfs node has been renamed */ if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0) goto out_bad; /* The kernfs node has been moved to a different namespace */ if (parent && kernfs_ns_enabled(parent) && kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; up_read(&root->kernfs_rwsem); return 1; out_bad: up_read(&root->kernfs_rwsem); return 0; } const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, }; static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct kernfs_root *root; struct inode *inode = NULL; const void *ns = NULL; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ if (kn) { /* Inactive nodes are invisible to the VFS so don't * create a negative. */ if (!kernfs_active(kn)) { up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } /* * Needed for negative dentry validation. * The negative dentry can be created in kernfs_iop_lookup() * or transforms from positive dentry in dentry_unlink_inode() * called from vfs_rmdir(). */ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&root->kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); } static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) return ERR_PTR(-EPERM); if (!kernfs_get_active(parent)) return ERR_PTR(-ENODEV); ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); return ERR_PTR(ret); } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (!scops || !scops->rmdir) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; ret = scops->rmdir(kn); kernfs_put_active(kn); return ret; } static int kernfs_iop_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (flags) return -EINVAL; if (!scops || !scops->rename) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; if (!kernfs_get_active(new_parent)) { kernfs_put_active(kn); return -ENODEV; } ret = scops->rename(kn, new_parent, new_dentry->d_name.name); kernfs_put_active(new_parent); kernfs_put_active(kn); return ret; } const struct inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, .mkdir = kernfs_iop_mkdir, .rmdir = kernfs_iop_rmdir, .rename = kernfs_iop_rename, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) { struct kernfs_node *last; while (true) { struct rb_node *rbn; last = pos; if (kernfs_type(pos) != KERNFS_DIR) break; rbn = rb_first(&pos->dir.children); if (!rbn) break; pos = rb_to_kn(rbn); } return last; } /** * kernfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: kernfs_node whose descendants to walk * * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. * * Return: the next descendant to visit or %NULL when done. */ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return kernfs_parent(pos); } static void kernfs_activate_one(struct kernfs_node *kn) { lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); kn->flags |= KERNFS_ACTIVATED; if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) return; WARN_ON_ONCE(rcu_access_pointer(kn->__parent) && RB_EMPTY_NODE(&kn->rb)); WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); } /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated * * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node * needs to be explicitly activated. A node which hasn't been activated * isn't visible to userland and deactivation is skipped during its * removal. This is useful to construct atomic init sequences where * creation of multiple nodes should either succeed or fail atomically. * * The caller is responsible for ensuring that this function is not called * after kernfs_remove*() is invoked on @kn. */ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) kernfs_activate_one(pos); up_write(&root->kernfs_rwsem); } /** * kernfs_show - show or hide a node * @kn: kernfs_node to show or hide * @show: whether to show or hide * * If @show is %false, @kn is marked hidden and deactivated. A hidden node is * ignored in future activaitons. If %true, the mark is removed and activation * state is restored. This function won't implicitly activate a new node in a * %KERNFS_ROOT_CREATE_DEACTIVATED root which hasn't been activated yet. * * To avoid recursion complexities, directories aren't supported for now. */ void kernfs_show(struct kernfs_node *kn, bool show) { struct kernfs_root *root = kernfs_root(kn); if (WARN_ON_ONCE(kernfs_type(kn) == KERNFS_DIR)) return; down_write(&root->kernfs_rwsem); if (show) { kn->flags &= ~KERNFS_HIDDEN; if (kn->flags & KERNFS_ACTIVATED) kernfs_activate_one(kn); } else { kn->flags |= KERNFS_HIDDEN; if (kernfs_active(kn)) atomic_add(KN_DEACTIVATED_BIAS, &kn->active); kernfs_drain(kn); } up_write(&root->kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos, *parent; /* Short-circuit if non-root @kn has already finished removal. */ if (!kn) return; lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem); /* * This is for kernfs_remove_self() which plays with active ref * after removal. */ if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb)) return; pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn)); /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { pos->flags |= KERNFS_REMOVING; if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); } /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* * kernfs_drain() may drop kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); kernfs_drain(pos); parent = kernfs_parent(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ if (!parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = parent ? parent->iattr : NULL; /* update timestamps on the parent */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); kernfs_put(pos); } kernfs_put(pos); } while (pos != kn); } /** * kernfs_remove - remove a kernfs_node recursively * @kn: the kernfs_node to remove * * Remove @kn along with all its subdirectories and files. */ void kernfs_remove(struct kernfs_node *kn) { struct kernfs_root *root; if (!kn) return; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); __kernfs_remove(kn); up_write(&root->kernfs_rwsem); } /** * kernfs_break_active_protection - break out of active protection * @kn: the self kernfs_node * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. Each invocation of * this function must also be matched with an invocation of * kernfs_unbreak_active_protection(). * * This function releases the active reference of @kn the caller is * holding. Once this function is called, @kn may be removed at any point * and the caller is solely responsible for ensuring that the objects it * dereferences are accessible. */ void kernfs_break_active_protection(struct kernfs_node *kn) { /* * Take out ourself out of the active ref dependency chain. If * we're called without an active ref, lockdep will complain. */ kernfs_put_active(kn); } /** * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() * @kn: the self kernfs_node * * If kernfs_break_active_protection() was called, this function must be * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of * being removed. Once kernfs_break_active_protection() is invoked, that * protection is irreversibly gone for the kernfs operation instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location * would be right before the enclosing kernfs operation returns. */ void kernfs_unbreak_active_protection(struct kernfs_node *kn) { /* * @kn->active could be in any state; however, the increment we do * here will be undone as soon as the enclosing kernfs operation * finishes and this temporary bump can't break anything. If @kn * is alive, nothing changes. If @kn is being deactivated, the * soon-to-follow put will either finish deactivation or restore * deactivated state. If @kn is already removed, the temporary * bump is guaranteed to be gone before @kn is released. */ atomic_inc(&kn->active); if (kernfs_lockdep(kn)) rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); } /** * kernfs_remove_self - remove a kernfs_node from its own method * @kn: the self kernfs_node to remove * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. This can be used to * implement a file operation which deletes itself. * * For example, the "delete" file for a sysfs device directory can be * implemented by invoking kernfs_remove_self() on the "delete" file * itself. This function breaks the circular dependency of trying to * deactivate self while holding an active ref itself. It isn't necessary * to modify the usual removal path to use kernfs_remove_self(). The * "delete" implementation can simply invoke kernfs_remove_self() on self * before proceeding with the usual removal path. kernfs will ignore later * kernfs_remove() on self. * * kernfs_remove_self() can be called multiple times concurrently on the * same kernfs_node. Only the first one actually performs removal and * returns %true. All others will wait until the kernfs operation which * won self-removal finishes and return %false. Note that the losers wait * for the completion of not only the winning kernfs_remove_self() but also * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. * * Return: %true if @kn is removed by this call, otherwise %false. */ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored * while kernfs_rwsem for held exclusive. The ones which lost * arbitration waits for SUICIDED && drained which can happen only * after the enclosing kernfs operation which executed the winning * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; __kernfs_remove(kn); kn->flags |= KERNFS_SUICIDED; ret = true; } else { wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; DEFINE_WAIT(wait); while (true) { prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); if ((kn->flags & KERNFS_SUICIDED) && atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; up_write(&root->kernfs_rwsem); schedule(); down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); ret = false; } /* * This must be done while kernfs_rwsem held exclusive; otherwise, * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); up_write(&root->kernfs_rwsem); return ret; } /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target * @name: name of the kernfs_node to remove * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. * * Return: %0 on success, -ENOENT if such entry doesn't exist. */ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; struct kernfs_root *root; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", name); return -ENOENT; } root = kernfs_root(parent); down_write(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) { kernfs_get(kn); __kernfs_remove(kn); kernfs_put(kn); } up_write(&root->kernfs_rwsem); if (kn) return 0; else return -ENOENT; } /** * kernfs_rename_ns - move and rename a kernfs_node * @kn: target node * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag * * Return: %0 on success, -errno on failure. */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; struct kernfs_root *root; const char *old_name; int error; /* can't move or rename root */ if (!rcu_access_pointer(kn->__parent)) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; old_parent = kernfs_parent(kn); if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) { error = -EINVAL; if (WARN_ON_ONCE(old_parent != new_parent)) goto out; } error = 0; old_name = kernfs_rcu_name(kn); if (!new_name) new_name = old_name; if ((old_parent == new_parent) && (kn->ns == new_ns) && (strcmp(old_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; /* rename kernfs_node */ if (strcmp(old_name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) goto out; } else { new_name = NULL; } /* * Move to the appropriate place in the appropriate directories rbtree. */ kernfs_unlink_sibling(kn); /* rename_lock protects ->parent accessors */ if (old_parent != new_parent) { kernfs_get(new_parent); write_lock_irq(&kernfs_rename_lock); rcu_assign_pointer(kn->__parent, new_parent); kn->ns = new_ns; if (new_name) rcu_assign_pointer(kn->name, new_name); write_unlock_irq(&kernfs_rename_lock); kernfs_put(old_parent); } else { /* name assignment is RCU protected, parent is the same */ kn->ns = new_ns; if (new_name) rcu_assign_pointer(kn->name, new_name); } kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns); kernfs_link_sibling(kn); if (new_name && !is_kernel_rodata((unsigned long)old_name)) kfree_rcu_mightsleep(old_name); error = 0; out: up_write(&root->kernfs_rwsem); return error; } static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); return 0; } static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { int valid = kernfs_active(pos) && rcu_access_pointer(pos->__parent) == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { struct rb_node *node = parent->dir.children.rb_node; while (node) { pos = rb_to_kn(node); if (hash < pos->hash) node = node->rb_left; else if (hash > pos->hash) node = node->rb_right; else break; } } /* Skip over entries which are dying/dead or in the wrong namespace */ while (pos && (!kernfs_active(pos) || pos->ns != ns)) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } return pos; } static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } while (pos && (!kernfs_active(pos) || pos->ns != ns)); } return pos; } static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; struct kernfs_root *root; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; root = kernfs_root(parent); down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = kernfs_rcu_name(pos); unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); if (!dir_emit(ctx, name, len, ino, type)) { up_read(&root->kernfs_rwsem); return 0; } } up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; } const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, .iterate_shared = kernfs_fop_readdir, .release = kernfs_dir_fop_release, .llseek = generic_file_llseek, };
1 1 4 3 1 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 // SPDX-License-Identifier: GPL-2.0 /* * Interface for controlling IO bandwidth on a request queue * * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" #include "blk-throttle.h" /* Max dispatch from a group in 1 round */ #define THROTL_GRP_QUANTUM 8 /* Total max dispatch from all groups in one round */ #define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) struct throtl_data { /* service tree for active throtl groups */ struct throtl_service_queue service_queue; struct request_queue *queue; /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; unsigned int throtl_slice; /* Work for dispatching throttled bios */ struct work_struct dispatch_work; bool track_bio_latency; }; static void throtl_pending_timer_fn(struct timer_list *t); static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) { return pd_to_blkg(&tg->pd); } /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest * * Return the throtl_grp @sq belongs to. If @sq is the top-level one * embedded in throtl_data, %NULL is returned. */ static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) { if (sq && sq->parent_sq) return container_of(sq, struct throtl_grp, service_queue); else return NULL; } /** * sq_to_td - return throtl_data the specified service queue belongs to * @sq: the throtl_service_queue of interest * * A service_queue can be embedded in either a throtl_grp or throtl_data. * Determine the associated throtl_data accordingly and return it. */ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) { struct throtl_grp *tg = sq_to_tg(sq); if (tg) return tg->td; else return container_of(sq, struct throtl_data, service_queue); } static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; return tg->iops[rw]; } /** * throtl_log - log debug message via blktrace * @sq: the service_queue being reported * @fmt: printf format string * @args: printf args * * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a * throtl_grp; otherwise, just "throtl". */ #define throtl_log(sq, fmt, args...) do { \ struct throtl_grp *__tg = sq_to_tg((sq)); \ struct throtl_data *__td = sq_to_td((sq)); \ \ (void)__td; \ if (likely(!blk_trace_note_message_enabled(__td->queue))) \ break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ } while (0) static inline unsigned int throtl_bio_data_size(struct bio *bio) { /* assume it's one sector */ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) return 512; return bio->bi_iter.bi_size; } static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); bio_list_init(&qn->bios); qn->tg = tg; } /** * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it * @bio: bio being added * @qn: qnode to add bio to * @queued: the service_queue->queued[] list @qn belongs to * * Add @bio to @qn and put @qn on @queued if it's not already on. * @qn->tg's reference count is bumped when @qn is activated. See the * comment on top of throtl_qnode definition for details. */ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, struct list_head *queued) { bio_list_add(&qn->bios, bio); if (list_empty(&qn->node)) { list_add_tail(&qn->node, queued); blkg_get(tg_to_blkg(qn->tg)); } } /** * throtl_peek_queued - peek the first bio on a qnode list * @queued: the qnode list to peek */ static struct bio *throtl_peek_queued(struct list_head *queued) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_peek(&qn->bios); WARN_ON_ONCE(!bio); return bio; } /** * throtl_pop_queued - pop the first bio form a qnode list * @queued: the qnode list to pop a bio from * @tg_to_put: optional out argument for throtl_grp to put * * Pop the first bio from the qnode list @queued. After popping, the first * qnode is removed from @queued if empty or moved to the end of @queued so * that the popping order is round-robin. * * When the first qnode is removed, its associated throtl_grp should be put * too. If @tg_to_put is NULL, this function automatically puts it; * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is * responsible for putting it. */ static struct bio *throtl_pop_queued(struct list_head *queued, struct throtl_grp **tg_to_put) { struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_pop(&qn->bios); WARN_ON_ONCE(!bio); if (bio_list_empty(&qn->bios)) { list_del_init(&qn->node); if (tg_to_put) *tg_to_put = qn->tg; else blkg_put(tg_to_blkg(qn->tg)); } else { list_move_tail(&qn->node, queued); } return bio; } /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[READ]); INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct throtl_grp *tg; int rw; tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); if (!tg) return NULL; if (blkg_rwstat_init(&tg->stat_bytes, gfp)) goto err_free_tg; if (blkg_rwstat_init(&tg->stat_ios, gfp)) goto err_exit_stat_bytes; throtl_service_queue_init(&tg->service_queue); for (rw = READ; rw <= WRITE; rw++) { throtl_qnode_init(&tg->qnode_on_self[rw], tg); throtl_qnode_init(&tg->qnode_on_parent[rw], tg); } RB_CLEAR_NODE(&tg->rb_node); tg->bps[READ] = U64_MAX; tg->bps[WRITE] = U64_MAX; tg->iops[READ] = UINT_MAX; tg->iops[WRITE] = UINT_MAX; return &tg->pd; err_exit_stat_bytes: blkg_rwstat_exit(&tg->stat_bytes); err_free_tg: kfree(tg); return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. If 16M * read_bps limit is set on a parent group, summary bps of * parent group and its subtree groups can't exceed 16M for the * device. * * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if * they're all separate root groups right below throtl_data. * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ sq->parent_sq = &td->service_queue; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; } /* * Set has_rules[] if @tg or any of its parents have limits configured. * This doesn't require walking up to the top of the hierarchy as the * parent's has_rules[] is guaranteed to be correct. */ static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || tg_bps_limit(tg, rw) != U64_MAX; } } static void throtl_pd_online(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ tg_update_has_rules(tg); } static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); timer_delete_sync(&tg->service_queue.pending_timer); blkg_rwstat_exit(&tg->stat_bytes); blkg_rwstat_exit(&tg->stat_ios); kfree(tg); } static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { struct rb_node *n; n = rb_first_cached(&parent_sq->pending_tree); WARN_ON_ONCE(!n); if (!n) return NULL; return rb_entry_tg(n); } static void throtl_rb_erase(struct rb_node *n, struct throtl_service_queue *parent_sq) { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) { struct throtl_grp *tg; tg = throtl_rb_first(parent_sq); if (!tg) return; parent_sq->first_pending_disptime = tg->disptime; } static void tg_service_queue_add(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; struct rb_node *parent = NULL; struct throtl_grp *__tg; unsigned long key = tg->disptime; bool leftmost = true; while (*node != NULL) { parent = *node; __tg = rb_entry_tg(parent); if (time_before(key, __tg->disptime)) node = &parent->rb_left; else { node = &parent->rb_right; leftmost = false; } } rb_link_node(&tg->rb_node, parent, node); rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, leftmost); } static void throtl_enqueue_tg(struct throtl_grp *tg) { if (!(tg->flags & THROTL_TG_PENDING)) { tg_service_queue_add(tg); tg->flags |= THROTL_TG_PENDING; tg->service_queue.parent_sq->nr_pending++; } } static void throtl_dequeue_tg(struct throtl_grp *tg) { if (tg->flags & THROTL_TG_PENDING) { struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; throtl_rb_erase(&tg->rb_node, parent_sq); --parent_sq->nr_pending; tg->flags &= ~THROTL_TG_PENDING; } } /* Call with queue lock held */ static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, unsigned long expires) { unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; /* * Since we are adjusting the throttle limit dynamically, the sleep * time calculated according to previous limit might be invalid. It's * possible the cgroup sleep time is very long and no other cgroups * have IO running so notify the limit changes. Make sure the cgroup * doesn't sleep too long to avoid the missed notification. */ if (time_after(expires, max_expire)) expires = max_expire; mod_timer(&sq->pending_timer, expires); throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", expires - jiffies, jiffies); } /** * throtl_schedule_next_dispatch - schedule the next dispatch cycle * @sq: the service_queue to schedule dispatch for * @force: force scheduling * * Arm @sq->pending_timer so that the next dispatch cycle starts on the * dispatch time of the first pending child. Returns %true if either timer * is armed or there's no pending child left. %false if the current * dispatch window is still open and the caller should continue * dispatching. * * If @force is %true, the dispatch timer is always scheduled and this * function is guaranteed to return %true. This is to be used when the * caller can't dispatch itself and needs to invoke pending_timer * unconditionally. Note that forced scheduling is likely to induce short * delay before dispatch starts even if @sq->first_pending_disptime is not * in the future and thus shouldn't be used in hot paths. */ static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, bool force) { /* any pending children left? */ if (!sq->nr_pending) return true; update_min_dispatch_time(sq); /* is the next dispatch time in the future? */ if (force || time_after(sq->first_pending_disptime, jiffies)) { throtl_schedule_pending_timer(sq, sq->first_pending_disptime); return true; } /* tell the caller to continue dispatching */ return false; } static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, bool rw, unsigned long start) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. That means since start of last slice, we never used * that bandwidth. Do try to make use of that bandwidth while giving * credit. */ if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, bool clear) { if (clear) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; } tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); } static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], tg->slice_end[rw], jiffies); } /* Determine if previously allocated or extended slice is complete or not */ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) { if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) return false; return true; } static unsigned int calculate_io_allowed(u32 iops_limit, unsigned long jiffy_elapsed) { unsigned int io_allowed; u64 tmp; /* * jiffy_elapsed should not be a big value as minimum iops can be * 1 then at max jiffy elapsed should be equivalent of 1 second as we * will allow dispatch after 1 second and after that slice should * have been trimmed. */ tmp = (u64)iops_limit * jiffy_elapsed; do_div(tmp, HZ); if (tmp > UINT_MAX) io_allowed = UINT_MAX; else io_allowed = tmp; return io_allowed; } static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) { /* * Can result be wider than 64 bits? * We check against 62, not 64, due to ilog2 truncation. */ if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) return U64_MAX; return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); } /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { unsigned long time_elapsed; long long bytes_trim; int io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); /* * If bps are unlimited (-1), then time slice don't get * renewed. Don't try to trim the slice if slice is used. A new * slice will start when appropriate. */ if (throtl_slice_used(tg, rw)) return; /* * A bio has been dispatched. Also adjust slice_end. It might happen * that initially cgroup limit was very low resulting in high * slice_end, but later limit was bumped up and bio was dispatched * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. */ throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); time_elapsed = rounddown(jiffies - tg->slice_start[rw], tg->td->throtl_slice); /* Don't trim slice until at least 2 slices are used */ if (time_elapsed < tg->td->throtl_slice * 2) return; /* * The bio submission time may be a few jiffies more than the expected * waiting time, due to 'extra_bytes' can't be divided in * tg_within_bps_limit(), and also due to timer wakeup delay. In this * case, adjust slice_start will discard the extra wait time, causing * lower rate than expected. Therefore, other than the above rounddown, * one extra slice is preserved for deviation. */ time_elapsed -= tg->td->throtl_slice; bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), time_elapsed); io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed); if (bytes_trim <= 0 && io_trim <= 0) return; if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else tg->io_disp[rw] = 0; tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], jiffies); } static void __tg_update_carryover(struct throtl_grp *tg, bool rw, long long *bytes, int *ios) { unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); /* * If config is updated while bios are still throttled, calculate and * accumulate how many bytes/ios are waited across changes. And * carryover_bytes/ios will be used to calculate new wait time under new * configuration. */ if (bps_limit != U64_MAX) *bytes = calculate_bytes_allowed(bps_limit, jiffy_elapsed) - tg->bytes_disp[rw]; if (iops_limit != UINT_MAX) *ios = calculate_io_allowed(iops_limit, jiffy_elapsed) - tg->io_disp[rw]; tg->bytes_disp[rw] -= *bytes; tg->io_disp[rw] -= *ios; } static void tg_update_carryover(struct throtl_grp *tg) { long long bytes[2] = {0}; int ios[2] = {0}; if (tg->service_queue.nr_queued[READ]) __tg_update_carryover(tg, READ, &bytes[READ], &ios[READ]); if (tg->service_queue.nr_queued[WRITE]) __tg_update_carryover(tg, WRITE, &bytes[WRITE], &ios[WRITE]); /* see comments in struct throtl_grp for meaning of these fields. */ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, bytes[READ], bytes[WRITE], ios[READ], ios[WRITE]); } static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, u32 iops_limit) { bool rw = bio_data_dir(bio); int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; if (iops_limit == UINT_MAX) { return 0; } jiffy_elapsed = jiffies - tg->slice_start[rw]; /* Round up to the next throttle slice, wait time must be nonzero */ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd); if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) return 0; /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; /* make sure at least one io can be dispatched after waiting */ jiffy_wait = max(jiffy_wait, HZ / iops_limit + 1); return jiffy_wait; } static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, u64 bps_limit) { bool rw = bio_data_dir(bio); long long bytes_allowed; u64 extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); /* no need to throttle if this bio's bytes have been accounted */ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { return 0; } jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. Consider one slice interval */ if (!jiffy_elapsed) jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd); if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) return 0; /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); if (!jiffy_wait) jiffy_wait = 1; /* * This wait time is without taking into consideration the rounding * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); return jiffy_wait; } /* * Returns whether one can dispatch a bio or not. Also returns approx number * of jiffies to wait before this bio is with-in IO rate and can be dispatched */ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, unsigned long *wait) { bool rw = bio_data_dir(bio); unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; u64 bps_limit = tg_bps_limit(tg, rw); u32 iops_limit = tg_iops_limit(tg, rw); /* * Currently whole state machine of group depends on first bio * queued in the group bio list. So one should not be calling * this function with a different bio if there are other bios * queued. */ BUG_ON(tg->service_queue.nr_queued[rw] && bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || tg->flags & THROTL_TG_CANCELING) { if (wait) *wait = 0; return true; } /* * If previous slice expired, start a new one otherwise renew/extend * existing slice to make sure it is at least throtl_slice interval * long since now. New slice is started only for empty throttle group. * If there is queued bio, that means there should be an active * slice and it should be extended instead. */ if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) throtl_start_new_slice(tg, rw, true); else { if (time_before(tg->slice_end[rw], jiffies + tg->td->throtl_slice)) throtl_extend_slice(tg, rw, jiffies + tg->td->throtl_slice); } bps_wait = tg_within_bps_limit(tg, bio, bps_limit); iops_wait = tg_within_iops_limit(tg, bio, iops_limit); if (bps_wait + iops_wait == 0) { if (wait) *wait = 0; return true; } max_wait = max(bps_wait, iops_wait); if (wait) *wait = max_wait; if (time_before(tg->slice_end[rw], jiffies + max_wait)) throtl_extend_slice(tg, rw, jiffies + max_wait); return false; } static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ if (!bio_flagged(bio, BIO_BPS_THROTTLED)) tg->bytes_disp[rw] += bio_size; tg->io_disp[rw]++; } /** * throtl_add_bio_tg - add a bio to the specified throtl_grp * @bio: bio to add * @qn: qnode to use * @tg: the target throtl_grp * * Add @bio to @tg's service_queue using @qn. If @qn is not specified, * tg->qnode_on_self[] is used. */ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; bool rw = bio_data_dir(bio); if (!qn) qn = &tg->qnode_on_self[rw]; /* * If @tg doesn't currently have any bios queued in the same * direction, queueing @bio can change when @tg should be * dispatched. Mark that @tg was empty. This is automatically * cleared on the next tg_update_disptime(). */ if (!sq->nr_queued[rw]) tg->flags |= THROTL_TG_WAS_EMPTY; throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); sq->nr_queued[rw]++; throtl_enqueue_tg(tg); } static void tg_update_disptime(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; struct bio *bio; bio = throtl_peek_queued(&sq->queued[READ]); if (bio) tg_may_dispatch(tg, bio, &read_wait); bio = throtl_peek_queued(&sq->queued[WRITE]); if (bio) tg_may_dispatch(tg, bio, &write_wait); min_wait = min(read_wait, write_wait); disptime = jiffies + min_wait; /* Update dispatch time */ throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); tg->disptime = disptime; tg_service_queue_add(tg); /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; } static void start_parent_slice_with_credit(struct throtl_grp *child_tg, struct throtl_grp *parent_tg, bool rw) { if (throtl_slice_used(parent_tg, rw)) { throtl_start_new_slice_with_credit(parent_tg, rw, child_tg->slice_start[rw]); } } static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) { struct throtl_service_queue *sq = &tg->service_queue; struct throtl_service_queue *parent_sq = sq->parent_sq; struct throtl_grp *parent_tg = sq_to_tg(parent_sq); struct throtl_grp *tg_to_put = NULL; struct bio *bio; /* * @bio is being transferred from @tg to @parent_sq. Popping a bio * from @tg may put its reference and @parent_sq might end up * getting released prematurely. Remember the tg to put and put it * after @bio is transferred to @parent_sq. */ bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); /* * If our parent is another tg, we just need to transfer @bio to * the parent using throtl_add_bio_tg(). If our parent is * @td->service_queue, @bio is ready to be issued. Put it on its * bio_lists[] and decrease total number queued. The caller is * responsible for issuing these bios. */ if (parent_tg) { throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], &parent_sq->queued[rw]); BUG_ON(tg->td->nr_queued[rw] <= 0); tg->td->nr_queued[rw]--; } throtl_trim_slice(tg, rw); if (tg_to_put) blkg_put(tg_to_blkg(tg_to_put)); } static int throtl_dispatch_tg(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ while ((bio = throtl_peek_queued(&sq->queued[READ])) && tg_may_dispatch(tg, bio, NULL)) { tg_dispatch_one_bio(tg, READ); nr_reads++; if (nr_reads >= max_nr_reads) break; } while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && tg_may_dispatch(tg, bio, NULL)) { tg_dispatch_one_bio(tg, WRITE); nr_writes++; if (nr_writes >= max_nr_writes) break; } return nr_reads + nr_writes; } static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) { unsigned int nr_disp = 0; while (1) { struct throtl_grp *tg; struct throtl_service_queue *sq; if (!parent_sq->nr_pending) break; tg = throtl_rb_first(parent_sq); if (!tg) break; if (time_before(jiffies, tg->disptime)) break; nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) tg_update_disptime(tg); else throtl_dequeue_tg(tg); if (nr_disp >= THROTL_QUANTUM) break; } return nr_disp; } /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being serviced * * This timer is armed when a child throtl_grp with active bio's become * pending and queued on the service_queue's pending_tree and expires when * the first child throtl_grp should be dispatched. This function * dispatches bio's from the children throtl_grps to the parent * service_queue. * * If the parent's parent is another throtl_grp, dispatching is propagated * by either arming its pending_timer or repeating dispatch directly. If * the top-level service_tree is reached, throtl_data->dispatch_work is * kicked so that the ready bio's are issued. */ static void throtl_pending_timer_fn(struct timer_list *t) { struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); struct throtl_service_queue *parent_sq; struct request_queue *q; bool dispatched; int ret; /* throtl_data may be gone, so figure out request queue by blkg */ if (tg) q = tg->pd.blkg->q; else q = td->queue; spin_lock_irq(&q->queue_lock); if (!q->root_blkg) goto out_unlock; again: parent_sq = sq->parent_sq; dispatched = false; while (true) { throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", sq->nr_queued[READ] + sq->nr_queued[WRITE], sq->nr_queued[READ], sq->nr_queued[WRITE]); ret = throtl_select_dispatch(sq); if (ret) { throtl_log(sq, "bios disp=%u", ret); dispatched = true; } if (throtl_schedule_next_dispatch(sq, false)) break; /* this dispatch windows is still open, relax and repeat */ spin_unlock_irq(&q->queue_lock); cpu_relax(); spin_lock_irq(&q->queue_lock); } if (!dispatched) goto out_unlock; if (parent_sq) { /* @parent_sq is another throl_grp, propagate dispatch */ if (tg->flags & THROTL_TG_WAS_EMPTY) { tg_update_disptime(tg); if (!throtl_schedule_next_dispatch(parent_sq, false)) { /* window is already open, repeat dispatching */ sq = parent_sq; tg = sq_to_tg(sq); goto again; } } } else { /* reached the top-level, queue issuing */ queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: spin_unlock_irq(&q->queue_lock); } /** * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work * @work: work item being executed * * This function is queued for execution when bios reach the bio_lists[] * of throtl_data->service_queue. Those bios are ready and issued by this * function. */ static void blk_throtl_dispatch_work_fn(struct work_struct *work) { struct throtl_data *td = container_of(work, struct throtl_data, dispatch_work); struct throtl_service_queue *td_sq = &td->service_queue; struct request_queue *q = td->queue; struct bio_list bio_list_on_stack; struct bio *bio; struct blk_plug plug; int rw; bio_list_init(&bio_list_on_stack); spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) bio_list_add(&bio_list_on_stack, bio); spin_unlock_irq(&q->queue_lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); u64 v = *(u64 *)((void *)tg + off); if (v == U64_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); unsigned int v = *(unsigned int *)((void *)tg + off); if (v == UINT_MAX) return 0; return __blkg_prfill_u64(sf, pd, v); } static int tg_print_conf_u64(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static int tg_print_conf_uint(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static void tg_conf_updated(struct throtl_grp *tg, bool global) { struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); rcu_read_lock(); /* * Update has_rules[] flags for the updated tg's subtree. A tg is * considered to have rules if either the tg itself or any of its * ancestors has rules. This identifies groups without any * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ blkg_for_each_descendant_pre(blkg, pos_css, global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; } rcu_read_unlock(); /* * We're already holding queue_lock and know @tg is valid. Let's * apply the new config directly. * * Restart the slices for both READ and WRITES. It might happen * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ throtl_start_new_slice(tg, READ, false); throtl_start_new_slice(tg, WRITE, false); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } } static int blk_throtl_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct throtl_data *td; unsigned int memflags; int ret; td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); if (!td) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); throtl_service_queue_init(&td->service_queue); /* * Freeze queue before activating policy, to synchronize with IO path, * which is protected by 'q_usage_counter'. */ memflags = blk_mq_freeze_queue(disk->queue); blk_mq_quiesce_queue(disk->queue); q->td = td; td->queue = q; /* activate policy */ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { q->td = NULL; kfree(td); goto out; } if (blk_queue_nonrot(q)) td->throtl_slice = DFL_THROTL_SLICE_SSD; else td->throtl_slice = DFL_THROTL_SLICE_HD; td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); out: blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue, memflags); return ret; } static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; int ret; u64 v; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) goto out_finish; if (!v) v = U64_MAX; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; else *(unsigned int *)((void *)tg + of_cft(of)->private) = v; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, true); } static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return tg_set_conf(of, buf, nbytes, off, false); } static int tg_print_rwstat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct blkg_rwstat_sample sum; blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, &sum); return __blkg_prfill_rwstat(sf, pd, &sum); } static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_rwstat_recursive, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_service_bytes_recursive", .private = offsetof(struct throtl_grp, stat_bytes), .seq_show = tg_print_rwstat_recursive, }, { .name = "throttle.io_serviced", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced_recursive", .private = offsetof(struct throtl_grp, stat_ios), .seq_show = tg_print_rwstat_recursive, }, { } /* terminate */ }; static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); u64 bps_dft; unsigned int iops_dft; if (!dname) return 0; bps_dft = U64_MAX; iops_dft = UINT_MAX; if (tg->bps[READ] == bps_dft && tg->bps[WRITE] == bps_dft && tg->iops[READ] == iops_dft && tg->iops[WRITE] == iops_dft) return 0; seq_printf(sf, "%s", dname); if (tg->bps[READ] == U64_MAX) seq_printf(sf, " rbps=max"); else seq_printf(sf, " rbps=%llu", tg->bps[READ]); if (tg->bps[WRITE] == U64_MAX) seq_printf(sf, " wbps=max"); else seq_printf(sf, " wbps=%llu", tg->bps[WRITE]); if (tg->iops[READ] == UINT_MAX) seq_printf(sf, " riops=max"); else seq_printf(sf, " riops=%u", tg->iops[READ]); if (tg->iops[WRITE] == UINT_MAX) seq_printf(sf, " wiops=max"); else seq_printf(sf, " wiops=%u", tg->iops[WRITE]); seq_printf(sf, "\n"); return 0; } static int tg_print_limit(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } static ssize_t tg_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; int ret; blkg_conf_init(&ctx, buf); ret = blkg_conf_open_bdev(&ctx); if (ret) goto out_finish; if (!blk_throtl_activated(ctx.bdev->bd_queue)) { ret = blk_throtl_init(ctx.bdev->bd_disk); if (ret) goto out_finish; } ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); v[0] = tg->bps[READ]; v[1] = tg->bps[WRITE]; v[2] = tg->iops[READ]; v[3] = tg->iops[WRITE]; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; u64 val = U64_MAX; int len; if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) break; if (tok[0] == '\0') break; ctx.body += len; ret = -EINVAL; p = tok; strsep(&p, "="); if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) goto out_finish; ret = -ERANGE; if (!val) goto out_finish; ret = -EINVAL; if (!strcmp(tok, "rbps")) v[0] = val; else if (!strcmp(tok, "wbps")) v[1] = val; else if (!strcmp(tok, "riops")) v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops")) v[3] = min_t(u64, val, UINT_MAX); else goto out_finish; } tg->bps[READ] = v[0]; tg->bps[WRITE] = v[1]; tg->iops[READ] = v[2]; tg->iops[WRITE] = v[3]; tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); return ret ?: nbytes; } static struct cftype throtl_files[] = { { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, }, { } /* terminate */ }; static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; cancel_work_sync(&td->dispatch_work); } static void tg_flush_bios(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; if (tg->flags & THROTL_TG_CANCELING) return; /* * Set the flag to make sure throtl_pending_timer_fn() won't * stop until all throttled bios are dispatched. */ tg->flags |= THROTL_TG_CANCELING; /* * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup * will be inserted to service queue without THROTL_TG_PENDING * set in tg_update_disptime below. Then IO dispatched from * child in tg_dispatch_one_bio will trigger double insertion * and corrupt the tree. */ if (!(tg->flags & THROTL_TG_PENDING)) return; /* * Update disptime after setting the above flag to make sure * throtl_select_dispatch() won't exit without dispatching. */ tg_update_disptime(tg); throtl_schedule_pending_timer(sq, jiffies + 1); } static void throtl_pd_offline(struct blkg_policy_data *pd) { tg_flush_bios(pd_to_tg(pd)); } struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; void blk_throtl_cancel_bios(struct gendisk *disk) { struct request_queue *q = disk->queue; struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; if (!blk_throtl_activated(q)) return; spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. * However, rcu lock is still held to emphasize that following * path need RCU protection and to prevent warning from lockdep. */ rcu_read_lock(); blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { /* * disk_release will call pd_offline_fn to cancel bios. * However, disk_release can't be called if someone get * the refcount of device and issued bios which are * inflight after del_gendisk. * Cancel bios here to ensure no bios are inflight after * del_gendisk. */ tg_flush_bios(blkg_to_tg(blkg)); } rcu_read_unlock(); spin_unlock_irq(&q->queue_lock); } static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) { /* throtl is FIFO - if bios are already queued, should queue */ if (tg->service_queue.nr_queued[rw]) return false; return tg_may_dispatch(tg, bio, NULL); } bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; struct throtl_data *td = tg->td; rcu_read_lock(); spin_lock_irq(&q->queue_lock); sq = &tg->service_queue; while (true) { if (tg_within_limit(tg, bio, rw)) { /* within limits, let's charge and dispatch directly */ throtl_charge_bio(tg, bio); /* * We need to trim slice even when bios are not being * queued otherwise it might happen that a bio is not * queued for a long time and slice keeps on extending * and trim is not called for a long time. Now if limits * are reduced suddenly we take into account all the IO * dispatched so far at new low rate and * newly queued * IO gets a really long dispatch time. * * So keep on trimming slice even if bio is not queued. */ throtl_trim_slice(tg, rw); } else if (bio_issue_as_root_blkg(bio)) { /* * IOs which may cause priority inversions are * dispatched directly, even if they're over limit. * * Charge and dispatch directly, and our throttle * control algorithm is adaptive, and extra IO bytes * will be throttled for paying the debt */ throtl_charge_bio(tg, bio); } else { /* if above limits, break to queue */ break; } /* * @bio passed through this layer without being throttled. * Climb up the ladder. If we're already at the top, it * can be executed directly. */ qn = &tg->qnode_on_parent[rw]; sq = sq->parent_sq; tg = sq_to_tg(sq); if (!tg) { bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock; } } /* out-of-limit, queue to @tg */ throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", rw == READ ? 'R' : 'W', tg->bytes_disp[rw], bio->bi_iter.bi_size, tg_bps_limit(tg, rw), tg->io_disp[rw], tg_iops_limit(tg, rw), sq->nr_queued[READ], sq->nr_queued[WRITE]); td->nr_queued[rw]++; throtl_add_bio_tg(bio, qn, tg); throttled = true; /* * Update @tg's dispatch time and force schedule dispatch if @tg * was empty before @bio. The forced scheduling isn't likely to * cause undue delay as @bio is likely to be dispatched directly if * its @tg's disptime is not in the future. */ if (tg->flags & THROTL_TG_WAS_EMPTY) { tg_update_disptime(tg); throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); } out_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; if (!blk_throtl_activated(q)) return; timer_delete_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); kfree(q->td); } static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); if (!kthrotld_workqueue) panic("Failed to create kthrotld\n"); return blkcg_policy_register(&blkcg_policy_throtl); } module_init(throtl_init);
1 1 245 48 262 201 5 16 34 18 8 195 143 136 4 111 168 10 43 176 177 43 18 12 111 112 4 4 4 143 140 143 7 141 143 155 16 128 11 21 47 118 21 100 188 187 48 145 187 188 188 136 55 115 115 114 115 24 95 114 115 115 114 115 3 110 3 8 1 4 4 4 6 4 8 1 1 2 1 2 2 10 2 8 3 2 1 10 3 2 292 261 32 12 282 282 282 282 2 9 142 41 41 330 133 2 122 39 36 2 39 211 324 1 352 4 3 12 23 11 12 11 1 85 333 47 47 16 12 143 133 16 3 123 120 111 3 290 107 293 11 181 18 152 412 2 2 391 1 17 352 406 1 1 1 405 112 290 390 102 301 263 11 47 144 72 185 420 2 416 6 186 255 6 2 7 1 178 170 257 420 4 4 4 4 118 3 115 118 122 122 3 120 3 3 1 2 2 3 3 3 1 2 1 5 3 2 2 1 8 2 8 5 8 3 5 25 18 6 1 11 9 7 3 8 5 12 3 140 3 110 384 251 142 143 422 422 1 13 384 21 4 3 2 1 19 1 1 6 2 8 6 2 4 4 4 4 8 4 6 6 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_api.c Packet action API. * * Author: Jamal Hadi Salim */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_pedit.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/tc_wrapper.h> #ifdef CONFIG_INET DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count); EXPORT_SYMBOL_GPL(tcf_frag_xmit_count); #endif int tcf_dev_queue_xmit(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)) { #ifdef CONFIG_INET if (static_branch_unlikely(&tcf_frag_xmit_count)) return sch_frag_xmit_hook(skb, xmit); #endif return xmit(skb); } EXPORT_SYMBOL_GPL(tcf_dev_queue_xmit); static void tcf_action_goto_chain_exec(const struct tc_action *a, struct tcf_result *res) { const struct tcf_chain *chain = rcu_dereference_bh(a->goto_chain); res->goto_tp = rcu_dereference_bh(chain->filter_chain); } static void tcf_free_cookie_rcu(struct rcu_head *p) { struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); kfree(cookie->data); kfree(cookie); } static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, struct tc_cookie *new_cookie) { struct tc_cookie *old; old = unrcu_pointer(xchg(old_cookie, RCU_INITIALIZER(new_cookie))); if (old) call_rcu(&old->rcu, tcf_free_cookie_rcu); } int tcf_action_check_ctrlact(int action, struct tcf_proto *tp, struct tcf_chain **newchain, struct netlink_ext_ack *extack) { int opcode = TC_ACT_EXT_OPCODE(action), ret = -EINVAL; u32 chain_index; if (!opcode) ret = action > TC_ACT_VALUE_MAX ? -EINVAL : 0; else if (opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC) ret = 0; if (ret) { NL_SET_ERR_MSG(extack, "invalid control action"); goto end; } if (TC_ACT_EXT_CMP(action, TC_ACT_GOTO_CHAIN)) { chain_index = action & TC_ACT_EXT_VAL_MASK; if (!tp || !newchain) { ret = -EINVAL; NL_SET_ERR_MSG(extack, "can't goto NULL proto/chain"); goto end; } *newchain = tcf_chain_get_by_act(tp->chain->block, chain_index); if (!*newchain) { ret = -ENOMEM; NL_SET_ERR_MSG(extack, "can't allocate goto_chain"); } } end: return ret; } EXPORT_SYMBOL(tcf_action_check_ctrlact); struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action, struct tcf_chain *goto_chain) { a->tcfa_action = action; goto_chain = rcu_replace_pointer(a->goto_chain, goto_chain, 1); return goto_chain; } EXPORT_SYMBOL(tcf_action_set_ctrlact); /* XXX: For standalone actions, we don't need a RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after a RCU grace period actions are already disconnected * from filters. Readers later can not find us. */ static void free_tcf(struct tc_action *p) { struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1); free_percpu(p->cpu_bstats); free_percpu(p->cpu_bstats_hw); free_percpu(p->cpu_qstats); tcf_set_action_cookie(&p->user_cookie, NULL); if (chain) tcf_chain_put_by_act(chain); kfree(p); } static void offload_action_hw_count_set(struct tc_action *act, u32 hw_count) { act->in_hw_count = hw_count; } static void offload_action_hw_count_inc(struct tc_action *act, u32 hw_count) { act->in_hw_count += hw_count; } static void offload_action_hw_count_dec(struct tc_action *act, u32 hw_count) { act->in_hw_count = act->in_hw_count > hw_count ? act->in_hw_count - hw_count : 0; } static unsigned int tcf_offload_act_num_actions_single(struct tc_action *act) { if (is_tcf_pedit(act)) return tcf_pedit_nkeys(act); else return 1; } static bool tc_act_skip_hw(u32 flags) { return (flags & TCA_ACT_FLAGS_SKIP_HW) ? true : false; } static bool tc_act_skip_sw(u32 flags) { return (flags & TCA_ACT_FLAGS_SKIP_SW) ? true : false; } /* SKIP_HW and SKIP_SW are mutually exclusive flags. */ static bool tc_act_flags_valid(u32 flags) { flags &= TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW; return flags ^ (TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW); } static int offload_action_init(struct flow_offload_action *fl_action, struct tc_action *act, enum offload_act_command cmd, struct netlink_ext_ack *extack) { int err; fl_action->extack = extack; fl_action->command = cmd; fl_action->index = act->tcfa_index; fl_action->cookie = (unsigned long)act; if (act->ops->offload_act_setup) { spin_lock_bh(&act->tcfa_lock); err = act->ops->offload_act_setup(act, fl_action, NULL, false, extack); spin_unlock_bh(&act->tcfa_lock); return err; } return -EOPNOTSUPP; } static int tcf_action_offload_cmd_ex(struct flow_offload_action *fl_act, u32 *hw_count) { int err; err = flow_indr_dev_setup_offload(NULL, NULL, TC_SETUP_ACT, fl_act, NULL, NULL); if (err < 0) return err; if (hw_count) *hw_count = err; return 0; } static int tcf_action_offload_cmd_cb_ex(struct flow_offload_action *fl_act, u32 *hw_count, flow_indr_block_bind_cb_t *cb, void *cb_priv) { int err; err = cb(NULL, NULL, cb_priv, TC_SETUP_ACT, NULL, fl_act, NULL); if (err < 0) return err; if (hw_count) *hw_count = 1; return 0; } static int tcf_action_offload_cmd(struct flow_offload_action *fl_act, u32 *hw_count, flow_indr_block_bind_cb_t *cb, void *cb_priv) { return cb ? tcf_action_offload_cmd_cb_ex(fl_act, hw_count, cb, cb_priv) : tcf_action_offload_cmd_ex(fl_act, hw_count); } static int tcf_action_offload_add_ex(struct tc_action *action, struct netlink_ext_ack *extack, flow_indr_block_bind_cb_t *cb, void *cb_priv) { bool skip_sw = tc_act_skip_sw(action->tcfa_flags); struct tc_action *actions[TCA_ACT_MAX_PRIO] = { [0] = action, }; struct flow_offload_action *fl_action; u32 in_hw_count = 0; int num, err = 0; if (tc_act_skip_hw(action->tcfa_flags)) return 0; num = tcf_offload_act_num_actions_single(action); fl_action = offload_action_alloc(num); if (!fl_action) return -ENOMEM; err = offload_action_init(fl_action, action, FLOW_ACT_REPLACE, extack); if (err) goto fl_err; err = tc_setup_action(&fl_action->action, actions, 0, extack); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to setup tc actions for offload"); goto fl_err; } err = tcf_action_offload_cmd(fl_action, &in_hw_count, cb, cb_priv); if (!err) cb ? offload_action_hw_count_inc(action, in_hw_count) : offload_action_hw_count_set(action, in_hw_count); if (skip_sw && !tc_act_in_hw(action)) err = -EINVAL; tc_cleanup_offload_action(&fl_action->action); fl_err: kfree(fl_action); return err; } /* offload the tc action after it is inserted */ static int tcf_action_offload_add(struct tc_action *action, struct netlink_ext_ack *extack) { return tcf_action_offload_add_ex(action, extack, NULL, NULL); } int tcf_action_update_hw_stats(struct tc_action *action) { struct flow_offload_action fl_act = {}; int err; err = offload_action_init(&fl_act, action, FLOW_ACT_STATS, NULL); if (err) return err; err = tcf_action_offload_cmd(&fl_act, NULL, NULL, NULL); if (!err) { preempt_disable(); tcf_action_stats_update(action, fl_act.stats.bytes, fl_act.stats.pkts, fl_act.stats.drops, fl_act.stats.lastused, true); preempt_enable(); action->used_hw_stats = fl_act.stats.used_hw_stats; action->used_hw_stats_valid = true; } else { return -EOPNOTSUPP; } return 0; } EXPORT_SYMBOL(tcf_action_update_hw_stats); static int tcf_action_offload_del_ex(struct tc_action *action, flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_offload_action fl_act = {}; u32 in_hw_count = 0; int err = 0; if (!tc_act_in_hw(action)) return 0; err = offload_action_init(&fl_act, action, FLOW_ACT_DESTROY, NULL); if (err) return err; err = tcf_action_offload_cmd(&fl_act, &in_hw_count, cb, cb_priv); if (err < 0) return err; if (!cb && action->in_hw_count != in_hw_count) return -EINVAL; /* do not need to update hw state when deleting action */ if (cb && in_hw_count) offload_action_hw_count_dec(action, in_hw_count); return 0; } static int tcf_action_offload_del(struct tc_action *action) { return tcf_action_offload_del_ex(action, NULL, NULL); } static void tcf_action_cleanup(struct tc_action *p) { tcf_action_offload_del(p); if (p->ops->cleanup) p->ops->cleanup(p); gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } static int __tcf_action_put(struct tc_action *p, bool bind) { struct tcf_idrinfo *idrinfo = p->idrinfo; if (refcount_dec_and_mutex_lock(&p->tcfa_refcnt, &idrinfo->lock)) { if (bind) atomic_dec(&p->tcfa_bindcnt); idr_remove(&idrinfo->action_idr, p->tcfa_index); mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); return 1; } if (bind) atomic_dec(&p->tcfa_bindcnt); return 0; } static int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) { int ret = 0; /* Release with strict==1 and bind==0 is only called through act API * interface (classifiers always bind). Only case when action with * positive reference count and zero bind count can exist is when it was * also created with act API (unbinding last classifier will destroy the * action if it was created by classifier). So only case when bind count * can be changed after initial check is when unbound action is * destroyed by act API while classifier binds to action with same id * concurrently. This result either creation of new action(same behavior * as before), or reusing existing action if concurrent process * increments reference count before action is deleted. Both scenarios * are acceptable. */ if (p) { if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; if (__tcf_action_put(p, bind)) ret = ACT_P_DELETED; } return ret; } int tcf_idr_release(struct tc_action *a, bool bind) { const struct tc_action_ops *ops = a->ops; int ret; ret = __tcf_idr_release(a, bind, false); if (ret == ACT_P_DELETED) module_put(ops->owner); return ret; } EXPORT_SYMBOL(tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { struct tc_cookie *user_cookie; u32 cookie_len = 0; rcu_read_lock(); user_cookie = rcu_dereference(act->user_cookie); if (user_cookie) cookie_len = nla_total_size(user_cookie->len); rcu_read_unlock(); return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_HW_STATS */ + nla_total_size(0) /* TCA_ACT_STATS nested */ + nla_total_size(sizeof(struct nla_bitfield32)) /* TCA_ACT_FLAGS */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) /* TCA_STATS_PKT64 */ + nla_total_size_64bit(sizeof(u64)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_ACT_OPTIONS nested */ + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */ } static size_t tcf_action_full_attrs_size(size_t sz) { return NLMSG_HDRLEN /* struct nlmsghdr */ + sizeof(struct tcamsg) + nla_total_size(0) /* TCA_ACT_TAB nested */ + sz; } static size_t tcf_action_fill_size(const struct tc_action *act) { size_t sz = tcf_action_shared_attrs_size(act); if (act->ops->get_fill_size) return act->ops->get_fill_size(act) + sz; return sz; } static int tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act) { unsigned char *b = skb_tail_pointer(skb); struct tc_cookie *cookie; if (nla_put_string(skb, TCA_ACT_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; if (from_act && nla_put_u32(skb, TCA_ACT_INDEX, a->tcfa_index)) goto nla_put_failure; rcu_read_lock(); cookie = rcu_dereference(a->user_cookie); if (cookie) { if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { rcu_read_unlock(); goto nla_put_failure; } } rcu_read_unlock(); return 0; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; int err = -EINVAL; u32 flags; if (tcf_action_dump_terse(skb, a, false)) goto nla_put_failure; if (a->hw_stats != TCA_ACT_HW_STATS_ANY && nla_put_bitfield32(skb, TCA_ACT_HW_STATS, a->hw_stats, TCA_ACT_HW_STATS_ANY)) goto nla_put_failure; if (a->used_hw_stats_valid && nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS, a->used_hw_stats, TCA_ACT_HW_STATS_ANY)) goto nla_put_failure; flags = a->tcfa_flags & TCA_ACT_FLAGS_USER_MASK; if (flags && nla_put_bitfield32(skb, TCA_ACT_FLAGS, flags, flags)) goto nla_put_failure; if (nla_put_u32(skb, TCA_ACT_IN_HW_COUNT, a->in_hw_count)) goto nla_put_failure; nest = nla_nest_start_noflag(skb, TCA_ACT_OPTIONS); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_old(skb, a, bind, ref); if (err > 0) { nla_nest_end(skb, nest); return err; } nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { int err = 0, index = -1, s_i = 0, n_i = 0; u32 act_flags = cb->args[2]; unsigned long jiffy_since = cb->args[3]; struct nlattr *nest; struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; unsigned long tmp; mutex_lock(&idrinfo->lock); s_i = cb->args[0]; idr_for_each_entry_ul(idr, p, tmp, id) { index++; if (index < s_i) continue; if (IS_ERR(p)) continue; if (jiffy_since && time_after(jiffy_since, (unsigned long)p->tcfa_tm.lastuse)) continue; tcf_action_update_hw_stats(p); nest = nla_nest_start_noflag(skb, n_i); if (!nest) { index--; goto nla_put_failure; } err = (act_flags & TCA_ACT_FLAG_TERSE_DUMP) ? tcf_action_dump_terse(skb, p, true) : tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); goto done; } nla_nest_end(skb, nest); n_i++; if (!(act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) && n_i >= TCA_ACT_MAX_PRIO) goto done; } done: if (index >= 0) cb->args[0] = index + 1; mutex_unlock(&idrinfo->lock); if (n_i) { if (act_flags & TCA_ACT_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; } return n_i; nla_put_failure: nla_nest_cancel(skb, nest); goto done; } static int tcf_idr_release_unsafe(struct tc_action *p) { if (atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; if (refcount_dec_and_test(&p->tcfa_refcnt)) { idr_remove(&p->idrinfo->action_idr, p->tcfa_index); tcf_action_cleanup(p); return ACT_P_DELETED; } return 0; } static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct nlattr *nest; int n_i = 0; int ret = -EINVAL; struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; unsigned long tmp; nest = nla_nest_start_noflag(skb, 0); if (nest == NULL) goto nla_put_failure; if (nla_put_string(skb, TCA_ACT_KIND, ops->kind)) goto nla_put_failure; ret = 0; mutex_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p)) continue; ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) break; n_i++; } mutex_unlock(&idrinfo->lock); if (ret < 0) { if (n_i) NL_SET_ERR_MSG(extack, "Unable to flush all TC actions"); else goto nla_put_failure; } ret = nla_put_u32(skb, TCA_FCNT, n_i); if (ret) goto nla_put_failure; nla_nest_end(skb, nest); return n_i; nla_put_failure: nla_nest_cancel(skb, nest); return ret; } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tcf_idrinfo *idrinfo = tn->idrinfo; if (type == RTM_DELACTION) { return tcf_del_walker(idrinfo, skb, ops, extack); } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { WARN(1, "tcf_generic_walker: unknown command %d\n", type); NL_SET_ERR_MSG(extack, "tcf_generic_walker: unknown command"); return -EINVAL; } } EXPORT_SYMBOL(tcf_generic_walker); int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (IS_ERR(p)) p = NULL; else if (p) refcount_inc(&p->tcfa_refcnt); mutex_unlock(&idrinfo->lock); if (p) { *a = p; return true; } return false; } EXPORT_SYMBOL(tcf_idr_search); static int __tcf_generic_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ops->net_id); if (unlikely(ops->walk)) return ops->walk(net, skb, cb, type, ops, extack); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int __tcf_idr_search(struct net *net, const struct tc_action_ops *ops, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, ops->net_id); if (unlikely(ops->lookup)) return ops->lookup(net, a, index); return tcf_idr_search(tn, a, index); } static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) { struct tc_action *p; int ret = 0; mutex_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (!p) { mutex_unlock(&idrinfo->lock); return -ENOENT; } if (!atomic_read(&p->tcfa_bindcnt)) { if (refcount_dec_and_test(&p->tcfa_refcnt)) { struct module *owner = p->ops->owner; WARN_ON(p != idr_remove(&idrinfo->action_idr, p->tcfa_index)); mutex_unlock(&idrinfo->lock); tcf_action_cleanup(p); module_put(owner); return 0; } ret = 0; } else { ret = -EPERM; } mutex_unlock(&idrinfo->lock); return ret; } int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats, u32 flags) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; refcount_set(&p->tcfa_refcnt, 1); if (bind) atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats) goto err1; p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats_hw) goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) goto err3; } gnet_stats_basic_sync_init(&p->tcfa_bstats); gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; p->tcfa_flags = flags; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, &p->tcfa_lock, false, est); if (err) goto err4; } p->idrinfo = idrinfo; __module_get(ops->owner); p->ops = ops; *a = p; return 0; err4: free_percpu(p->cpu_qstats); err3: free_percpu(p->cpu_bstats_hw); err2: free_percpu(p->cpu_bstats); err1: kfree(p); return err; } EXPORT_SYMBOL(tcf_idr_create); int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags) { /* Set cpustats according to actions flags. */ return tcf_idr_create(tn, index, est, a, ops, bind, !(flags & TCA_ACT_FLAGS_NO_PERCPU_STATS), flags); } EXPORT_SYMBOL(tcf_idr_create_from_flags); /* Cleanup idr index that was allocated but not initialized. */ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; mutex_lock(&idrinfo->lock); /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); mutex_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_cleanup); /* Check if action with specified index exists. If actions is found, increments * its reference and bind counters, and return 1. Otherwise insert temporary * error pointer (to prevent concurrent users from inserting actions with same * index) and return 0. * * May return -EAGAIN for binding actions in case of a parallel add/delete on * the requested index. */ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind) { struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; int ret; u32 max; if (*index) { rcu_read_lock(); p = idr_find(&idrinfo->action_idr, *index); if (IS_ERR(p)) { /* This means that another process allocated * index but did not assign the pointer yet. */ rcu_read_unlock(); return -EAGAIN; } if (!p) { /* Empty slot, try to allocate it */ max = *index; rcu_read_unlock(); goto new; } if (!refcount_inc_not_zero(&p->tcfa_refcnt)) { /* Action was deleted in parallel */ rcu_read_unlock(); return -EAGAIN; } if (bind) atomic_inc(&p->tcfa_bindcnt); *a = p; rcu_read_unlock(); return 1; } else { /* Find a slot */ *index = 1; max = UINT_MAX; } new: *a = NULL; mutex_lock(&idrinfo->lock); ret = idr_alloc_u32(&idrinfo->action_idr, ERR_PTR(-EBUSY), index, max, GFP_KERNEL); mutex_unlock(&idrinfo->lock); /* N binds raced for action allocation, * retry for all the ones that failed. */ if (ret == -ENOSPC && *index == max) ret = -EAGAIN; return ret; } EXPORT_SYMBOL(tcf_idr_check_alloc); void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { struct idr *idr = &idrinfo->action_idr; struct tc_action *p; int ret; unsigned long id = 1; unsigned long tmp; idr_for_each_entry_ul(idr, p, tmp, id) { ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return; } idr_destroy(&idrinfo->action_idr); } EXPORT_SYMBOL(tcf_idrinfo_destroy); static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); /* since act ops id is stored in pernet subsystem list, * then there is no way to walk through only all the action * subsystem, so we keep tc action pernet ops id for * reoffload to walk through. */ static LIST_HEAD(act_pernet_id_list); static DEFINE_MUTEX(act_id_mutex); struct tc_act_pernet_id { struct list_head list; unsigned int id; }; static int tcf_pernet_add_id_list(unsigned int id) { struct tc_act_pernet_id *id_ptr; int ret = 0; mutex_lock(&act_id_mutex); list_for_each_entry(id_ptr, &act_pernet_id_list, list) { if (id_ptr->id == id) { ret = -EEXIST; goto err_out; } } id_ptr = kzalloc(sizeof(*id_ptr), GFP_KERNEL); if (!id_ptr) { ret = -ENOMEM; goto err_out; } id_ptr->id = id; list_add_tail(&id_ptr->list, &act_pernet_id_list); err_out: mutex_unlock(&act_id_mutex); return ret; } static void tcf_pernet_del_id_list(unsigned int id) { struct tc_act_pernet_id *id_ptr; mutex_lock(&act_id_mutex); list_for_each_entry(id_ptr, &act_pernet_id_list, list) { if (id_ptr->id == id) { list_del(&id_ptr->list); kfree(id_ptr); break; } } mutex_unlock(&act_id_mutex); } int tcf_register_action(struct tc_action_ops *act, struct pernet_operations *ops) { struct tc_action_ops *a; int ret; if (!act->act || !act->dump || !act->init) return -EINVAL; /* We have to register pernet ops before making the action ops visible, * otherwise tcf_action_init_1() could get a partially initialized * netns. */ ret = register_pernet_subsys(ops); if (ret) return ret; if (ops->id) { ret = tcf_pernet_add_id_list(*ops->id); if (ret) goto err_id; } write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (act->id == a->id || (strcmp(act->kind, a->kind) == 0)) { ret = -EEXIST; goto err_out; } } list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; err_out: write_unlock(&act_mod_lock); if (ops->id) tcf_pernet_del_id_list(*ops->id); err_id: unregister_pernet_subsys(ops); return ret; } EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act, struct pernet_operations *ops) { struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (a == act) { list_del(&act->head); err = 0; break; } } write_unlock(&act_mod_lock); if (!err) { unregister_pernet_subsys(ops); if (ops->id) tcf_pernet_del_id_list(*ops->id); } return err; } EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops *tc_lookup_action_n(char *kind) { struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (strcmp(kind, a->kind) == 0) { if (try_module_get(a->owner)) res = a; break; } } read_unlock(&act_mod_lock); } return res; } /* lookup by nlattr */ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (nla_strcmp(kind, a->kind) == 0) { if (try_module_get(a->owner)) res = a; break; } } read_unlock(&act_mod_lock); } return res; } /*TCA_ACT_MAX_PRIO is 32, there count up to 32 */ #define TCA_ACT_MAX_PRIO_MASK 0x1FF int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) { u32 jmp_prgcnt = 0; u32 jmp_ttl = TCA_ACT_MAX_PRIO; /*matches actions per filter */ int i; int ret = TC_ACT_OK; if (skb_skip_tc_classify(skb)) return TC_ACT_OK; restart_act_graph: for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; int repeat_ttl; if (jmp_prgcnt > 0) { jmp_prgcnt -= 1; continue; } if (tc_act_skip_sw(a->tcfa_flags)) continue; repeat_ttl = 32; repeat: ret = tc_act(skb, a, res); if (unlikely(ret == TC_ACT_REPEAT)) { if (--repeat_ttl != 0) goto repeat; /* suspicious opcode, stop pipeline */ net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n"); return TC_ACT_OK; } if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) { /* faulty opcode, stop pipeline */ return TC_ACT_OK; } else { jmp_ttl -= 1; if (jmp_ttl > 0) goto restart_act_graph; else /* faulty graph, stop pipeline */ return TC_ACT_OK; } } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { if (unlikely(!rcu_access_pointer(a->goto_chain))) { tcf_set_drop_reason(skb, SKB_DROP_REASON_TC_CHAIN_NOTFOUND); return TC_ACT_SHOT; } tcf_action_goto_chain_exec(a, res); } if (ret != TC_ACT_PIPE) break; } return ret; } EXPORT_SYMBOL(tcf_action_exec); int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; struct tc_action *a; int ret = 0, i; tcf_act_for_each_action(i, a, actions) { actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return ret; } return ret; } static int tcf_action_put(struct tc_action *p) { return __tcf_action_put(p, false); } static void tcf_action_put_many(struct tc_action *actions[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; if (tcf_action_put(a)) module_put(ops->owner); } } static void tca_put_bound_many(struct tc_action *actions[], int init_res[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; if (init_res[i] == ACT_P_CREATED) continue; if (tcf_action_put(a)) module_put(ops->owner); } } int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { return a->ops->dump(skb, a, bind, ref); } int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref, bool terse) { struct tc_action *a; int err = -EINVAL, i; struct nlattr *nest; tcf_act_for_each_action(i, a, actions) { nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; err = terse ? tcf_action_dump_terse(skb, a, false) : tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; nla_nest_end(skb, nest); } return 0; nla_put_failure: err = -EINVAL; errout: nla_nest_cancel(skb, nest); return err; } static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) { struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return NULL; c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL); if (!c->data) { kfree(c); return NULL; } c->len = nla_len(tb[TCA_ACT_COOKIE]); return c; } static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) { struct nla_bitfield32 hw_stats_bf; /* If the user did not pass the attr, that means he does * not care about the type. Return "any" in that case * which is setting on all supported types. */ if (!hw_stats_attr) return TCA_ACT_HW_STATS_ANY; hw_stats_bf = nla_get_bitfield32(hw_stats_attr); return hw_stats_bf.value; } static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS | TCA_ACT_FLAGS_SKIP_HW | TCA_ACT_FLAGS_SKIP_SW), [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { struct tcf_idrinfo *idrinfo; if (init_res[i] == ACT_P_BOUND) continue; idrinfo = a->idrinfo; mutex_lock(&idrinfo->lock); /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ idr_replace(&idrinfo->action_idr, a, a->tcfa_index); mutex_unlock(&idrinfo->lock); } } struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack) { bool police = flags & TCA_ACT_FLAGS_POLICE; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action_ops *a_o; char act_name[IFNAMSIZ]; struct nlattr *kind; int err; if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) return ERR_PTR(err); err = -EINVAL; kind = tb[TCA_ACT_KIND]; if (!kind) { NL_SET_ERR_MSG(extack, "TC action kind must be specified"); return ERR_PTR(err); } if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(err); } } else { if (strscpy(act_name, "police", IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(-EINVAL); } } a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); if (rtnl_held) rtnl_unlock(); request_module(NET_ACT_ALIAS_PREFIX "%s", act_name); if (rtnl_held) rtnl_lock(); a_o = tc_lookup_action_n(act_name); /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to * tell the caller to replay the request. We * indicate this using -EAGAIN. */ if (a_o != NULL) { module_put(a_o->owner); return ERR_PTR(-EAGAIN); } #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); return ERR_PTR(-ENOENT); } return a_o; } struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action_ops *a_o, int *init_res, u32 flags, struct netlink_ext_ack *extack) { bool police = flags & TCA_ACT_FLAGS_POLICE; struct nla_bitfield32 userflags = { 0, 0 }; struct tc_cookie *user_cookie = NULL; u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; int err; /* backward compatibility for policer */ if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) return ERR_PTR(err); if (tb[TCA_ACT_COOKIE]) { user_cookie = nla_memdup_cookie(tb); if (!user_cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); err = -ENOMEM; goto err_out; } } hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); if (tb[TCA_ACT_FLAGS]) { userflags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); if (!tc_act_flags_valid(userflags.value)) { err = -EINVAL; goto err_out; } } err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, tp, userflags.value | flags, extack); } else { err = a_o->init(net, nla, est, &a, tp, userflags.value | flags, extack); } if (err < 0) goto err_out; *init_res = err; if (!police && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->user_cookie, user_cookie); if (!police) a->hw_stats = hw_stats; return a; err_out: if (user_cookie) { kfree(user_cookie->data); kfree(user_cookie); } return ERR_PTR(err); } static bool tc_act_bind(u32 flags) { return !!(flags & TCA_ACT_FLAGS_BIND); } /* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 2]; struct tc_action *act; size_t sz = 0; int err; int i; err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL, extack); if (err < 0) return err; /* The nested attributes are parsed as types, but they are really an * array of actions. So we parse one more than we can handle, and return * an error if the last one is set (as that indicates that the request * contained more than the maximum number of actions). */ if (tb[TCA_ACT_MAX_PRIO + 1]) { NL_SET_ERR_MSG_FMT(extack, "Only %d actions supported per filter", TCA_ACT_MAX_PRIO); return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; a_o = tc_action_load_ops(tb[i], flags, extack); if (IS_ERR(a_o)) { err = PTR_ERR(a_o); goto err_mod; } ops[i - 1] = a_o; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, ops[i - 1], &init_res[i - 1], flags, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; } sz += tcf_action_fill_size(act); /* Start from index 0 */ actions[i - 1] = act; if (tc_act_bind(flags)) { bool skip_sw = tc_skip_sw(fl_flags); bool skip_hw = tc_skip_hw(fl_flags); if (tc_act_bind(act->tcfa_flags)) { /* Action is created by classifier and is not * standalone. Check that the user did not set * any action flags different than the * classifier flags, and inherit the flags from * the classifier for the compatibility case * where no flags were specified at all. */ if ((tc_act_skip_sw(act->tcfa_flags) && !skip_sw) || (tc_act_skip_hw(act->tcfa_flags) && !skip_hw)) { NL_SET_ERR_MSG(extack, "Mismatch between action and filter offload flags"); err = -EINVAL; goto err; } if (skip_sw) act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_SW; if (skip_hw) act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_HW; continue; } /* Action is standalone */ if (skip_sw != tc_act_skip_sw(act->tcfa_flags) || skip_hw != tc_act_skip_hw(act->tcfa_flags)) { NL_SET_ERR_MSG(extack, "Mismatch between action and filter offload flags"); err = -EINVAL; goto err; } } else { err = tcf_action_offload_add(act, extack); if (tc_act_skip_sw(act->tcfa_flags) && err) goto err; } } /* We have to commit them all together, because if any error happened in * between, we could not handle the failure gracefully. */ tcf_idr_insert_many(actions, init_res); *attr_size = tcf_action_full_attrs_size(sz); err = i - 1; goto err_mod; err: tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND); err_mod: for (i = 0; i < TCA_ACT_MAX_PRIO && ops[i]; i++) module_put(ops[i]->owner); return err; } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, u64 drops, bool hw) { if (a->cpu_bstats) { _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), bytes, packets); return; } _bstats_update(&a->tcfa_bstats, bytes, packets); a->tcfa_qstats.drops += drops; if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } EXPORT_SYMBOL(tcf_action_update_stats); int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { int err = 0; struct gnet_dump d; if (p == NULL) goto errout; /* compat_mode being true specifies a call that is supposed * to add additional backward compatibility statistic TLVs. */ if (compat_mode) { if (p->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, TCA_STATS, TCA_XSTATS, &p->tcfa_lock, &d, TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, &p->tcfa_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfa_bstats, false) < 0 || gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, p->tcfa_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) goto errout; return 0; errout: return -1; } static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref, struct netlink_ext_ack *extack) { struct tcamsg *t; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); if (!nlh) goto out_nlmsg_trim; t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; if (extack && extack->_msg && nla_put_string(skb, TCA_ROOT_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) goto out_nlmsg_trim; if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nlmsg_trim(skb, b); return -1; } static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 1, NULL) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } return rtnl_unicast(skb, net, portid); } static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; struct tc_action *a; int index; int err; err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; if (tb[TCA_ACT_INDEX] == NULL || nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) { NL_SET_ERR_MSG(extack, "Invalid TC action index value"); goto err_out; } index = nla_get_u32(tb[TCA_ACT_INDEX]); err = -EINVAL; ops = tc_lookup_action(tb[TCA_ACT_KIND]); if (!ops) { /* could happen in batch of actions */ NL_SET_ERR_MSG(extack, "Specified TC action kind not found"); goto err_out; } err = -ENOENT; if (__tcf_idr_search(net, ops, &a, index) == 0) { NL_SET_ERR_MSG(extack, "TC action with specified index not found"); goto err_mod; } module_put(ops->owner); return a; err_mod: module_put(ops->owner); err_out: return ERR_PTR(err); } static int tca_action_flush(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, struct netlink_ext_ack *extack) { struct sk_buff *skb; unsigned char *b; struct nlmsghdr *nlh; struct tcamsg *t; struct netlink_callback dcb; struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; struct nlattr *kind; int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return err; b = skb_tail_pointer(skb); err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; ops = tc_lookup_action(kind); if (!ops) { /*some idjot trying to flush unknown action */ NL_SET_ERR_MSG(extack, "Cannot flush unknown TC action"); goto err_out; } nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); if (!nlh) { NL_SET_ERR_MSG(extack, "Failed to create TC action flush notification"); goto out_module_put; } t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (!nest) { NL_SET_ERR_MSG(extack, "Failed to add new netlink message"); goto out_module_put; } err = __tcf_generic_walker(net, skb, &dcb, RTM_DELACTION, ops, extack); if (err <= 0) { nla_nest_cancel(skb, nest); goto out_module_put; } nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); return err; out_module_put: module_put(ops->owner); err_out: kfree_skb(skb); return err; } static int tcf_action_delete(struct net *net, struct tc_action *actions[]) { struct tc_action *a; int i; tcf_act_for_each_action(i, a, actions) { const struct tc_action_ops *ops = a->ops; /* Actions can be deleted concurrently so we must save their * type and id to search again after reference is released. */ struct tcf_idrinfo *idrinfo = a->idrinfo; u32 act_index = a->tcfa_index; actions[i] = NULL; if (tcf_action_put(a)) { /* last reference, action was deleted concurrently */ module_put(ops->owner); } else { int ret; /* now do the delete */ ret = tcf_idr_delete_index(idrinfo, act_index); if (ret < 0) return ret; } } return 0; } static struct sk_buff *tcf_reoffload_del_notify_msg(struct net *net, struct tc_action *action) { size_t attr_size = tcf_action_fill_size(action); struct tc_action *actions[TCA_ACT_MAX_PRIO] = { [0] = action, }; struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) { kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_reoffload_del_notify(struct net *net, struct tc_action *action) { const struct tc_action_ops *ops = action->ops; struct sk_buff *skb; int ret; if (!rtnl_notify_needed(net, 0, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_reoffload_del_notify_msg(net, action); if (IS_ERR(skb)) return PTR_ERR(skb); } ret = tcf_idr_release_unsafe(action); if (ret == ACT_P_DELETED) { module_put(ops->owner); ret = rtnetlink_maybe_send(skb, net, 0, RTNLGRP_TC, 0); } else { kfree_skb(skb); } return ret; } int tcf_action_reoffload_cb(flow_indr_block_bind_cb_t *cb, void *cb_priv, bool add) { struct tc_act_pernet_id *id_ptr; struct tcf_idrinfo *idrinfo; struct tc_action_net *tn; struct tc_action *p; unsigned int act_id; unsigned long tmp; unsigned long id; struct idr *idr; struct net *net; int ret; if (!cb) return -EINVAL; down_read(&net_rwsem); mutex_lock(&act_id_mutex); for_each_net(net) { list_for_each_entry(id_ptr, &act_pernet_id_list, list) { act_id = id_ptr->id; tn = net_generic(net, act_id); if (!tn) continue; idrinfo = tn->idrinfo; if (!idrinfo) continue; mutex_lock(&idrinfo->lock); idr = &idrinfo->action_idr; idr_for_each_entry_ul(idr, p, tmp, id) { if (IS_ERR(p) || tc_act_bind(p->tcfa_flags)) continue; if (add) { tcf_action_offload_add_ex(p, NULL, cb, cb_priv); continue; } /* cb unregister to update hw count */ ret = tcf_action_offload_del_ex(p, cb, cb_priv); if (ret < 0) continue; if (tc_act_skip_sw(p->tcfa_flags) && !tc_act_in_hw(p)) tcf_reoffload_del_notify(net, p); } mutex_unlock(&idrinfo->lock); } } mutex_unlock(&act_id_mutex); up_read(&net_rwsem); return 0; } static struct sk_buff *tcf_del_notify_msg(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, 0, 2, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; int ret; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_del_notify_msg(net, n, actions, portid, attr_size, extack); if (IS_ERR(skb)) return PTR_ERR(skb); } /* now do the delete */ ret = tcf_action_delete(net, actions); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); return ret; } return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int event, struct netlink_ext_ack *extack) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; ret = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { if (tb[1]) return tca_action_flush(net, tb[1], n, portid, extack); NL_SET_ERR_MSG(extack, "Invalid netlink attributes while flushing TC action"); return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_get_1(net, tb[i], n, portid, extack); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; } attr_size += tcf_action_fill_size(act); actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ ret = tcf_del_notify(net, n, actions, portid, attr_size, extack); if (ret) goto err; return 0; } err: tcf_action_put_many(actions); return ret; } static struct sk_buff *tcf_add_notify_msg(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(max(attr_size, NLMSG_GOODSIZE), GFP_KERNEL); if (!skb) return ERR_PTR(-ENOBUFS); if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_NEWACTION, 0, 0, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return ERR_PTR(-EINVAL); } return skb; } static int tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { skb = NULL; } else { skb = tcf_add_notify_msg(net, n, actions, portid, attr_size, extack); if (IS_ERR(skb)) return PTR_ERR(skb); } return rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, u32 flags, struct netlink_ext_ack *extack) { size_t attr_size = 0; int loop, ret; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; int init_res[TCA_ACT_MAX_PRIO] = {}; for (loop = 0; loop < 10; loop++) { ret = tcf_action_init(net, NULL, nla, NULL, actions, init_res, &attr_size, flags, 0, extack); if (ret != -EAGAIN) break; } if (ret < 0) return ret; ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); /* only put bound actions */ tca_put_bound_many(actions, init_res); return ret; } static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAG_LARGE_DUMP_ON | TCA_ACT_FLAG_TERSE_DUMP), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; u32 flags = 0; int ret = 0; if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL, extack); if (ret < 0) return ret; if (tca[TCA_ACT_TAB] == NULL) { NL_SET_ERR_MSG(extack, "Netlink action attributes missing"); return -EINVAL; } /* n->nlmsg_flags & NLM_F_CREATE */ switch (n->nlmsg_type) { case RTM_NEWACTION: /* we are going to assume all other flags * imply create only if it doesn't exist * Note that CREATE | EXCL implies that * but since we want avoid ambiguity (eg when flags * is zero) then just set this */ if (n->nlmsg_flags & NLM_F_REPLACE) flags = TCA_ACT_FLAGS_REPLACE; ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, flags, extack); break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, portid, RTM_DELACTION, extack); break; case RTM_GETACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, portid, RTM_GETACTION, extack); break; default: BUG(); } return ret; } static struct nlattr *find_dump_kind(struct nlattr **nla) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct nlattr *kind; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; if (nla_parse_deprecated(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0) return NULL; if (tb[1] == NULL) return NULL; if (nla_parse_nested_deprecated(tb2, TCA_ACT_MAX, tb[1], tcf_action_policy, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; return kind; } static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; struct tc_action_ops *a_o; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *tb[TCA_ROOT_MAX + 1]; struct nlattr *count_attr = NULL; unsigned long jiffy_since = 0; struct nlattr *kind = NULL; struct nla_bitfield32 bf; u32 msecs_since = 0; u32 act_count = 0; ret = nlmsg_parse_deprecated(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, tcaa_policy, cb->extack); if (ret < 0) return ret; kind = find_dump_kind(tb); if (kind == NULL) { pr_info("tc_dump_action: action bad kind\n"); return 0; } a_o = tc_lookup_action(kind); if (a_o == NULL) return 0; cb->args[2] = 0; if (tb[TCA_ROOT_FLAGS]) { bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]); cb->args[2] = bf.value; } if (tb[TCA_ROOT_TIME_DELTA]) { msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]); } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; if (msecs_since) jiffy_since = jiffies - msecs_to_jiffies(msecs_since); t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; cb->args[3] = jiffy_since; count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); if (!count_attr) goto out_module_put; nest = nla_nest_start_noflag(skb, TCA_ACT_TAB); if (nest == NULL) goto out_module_put; ret = __tcf_generic_walker(net, skb, cb, RTM_GETACTION, a_o, NULL); if (ret < 0) goto out_module_put; if (ret > 0) { nla_nest_end(skb, nest); ret = skb->len; act_count = cb->args[1]; memcpy(nla_data(count_attr), &act_count, sizeof(u32)); cb->args[1] = 0; } else nlmsg_trim(skb, b); nlh->nlmsg_len = skb_tail_pointer(skb) - b; if (NETLINK_CB(cb->skb).portid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); return skb->len; out_module_put: module_put(a_o->owner); nlmsg_trim(skb, b); return skb->len; } static const struct rtnl_msg_handler tc_action_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_NEWACTION, .doit = tc_ctl_action}, {.msgtype = RTM_DELACTION, .doit = tc_ctl_action}, {.msgtype = RTM_GETACTION, .doit = tc_ctl_action, .dumpit = tc_dump_action}, }; static int __init tc_action_init(void) { rtnl_register_many(tc_action_rtnl_msg_handlers); return 0; } subsys_initcall(tc_action_init);
37 3 12 4 6 7 6 6 2 2 2 23 6 9 8 23 17 5 6 5 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> void nft_immediate_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); nft_data_copy(&regs->data[priv->dreg], &priv->data, priv->dlen); } static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { [NFTA_IMMEDIATE_DREG] = { .type = NLA_U32 }, [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, }; static enum nft_data_types nft_reg_to_type(const struct nlattr *nla) { enum nft_data_types type; u8 reg; reg = ntohl(nla_get_be32(nla)); if (reg == NFT_REG_VERDICT) type = NFT_DATA_VERDICT; else type = NFT_DATA_VALUE; return type; } static int nft_immediate_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_data_desc desc = { .size = sizeof(priv->data), }; int err; if (tb[NFTA_IMMEDIATE_DREG] == NULL || tb[NFTA_IMMEDIATE_DATA] == NULL) return -EINVAL; desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]); err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; priv->dlen = desc.len; err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG], &priv->dreg, &priv->data, desc.type, desc.len); if (err < 0) goto err1; if (priv->dreg == NFT_REG_VERDICT) { struct nft_chain *chain = priv->data.verdict.chain; switch (priv->data.verdict.code) { case NFT_JUMP: case NFT_GOTO: err = nf_tables_bind_chain(ctx, chain); if (err < 0) goto err1; break; default: break; } } return 0; err1: nft_data_release(&priv->data, desc.type); return err; } static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_ctx chain_ctx; struct nft_chain *chain; struct nft_rule *rule; if (priv->dreg == NFT_REG_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; chain_ctx = *ctx; chain_ctx.chain = chain; list_for_each_entry(rule, &chain->rules, list) nft_rule_expr_activate(&chain_ctx, rule); nft_clear(ctx->net, chain); break; default: break; } } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx, struct nft_chain *chain, enum nft_trans_phase phase) { struct nft_ctx chain_ctx; struct nft_rule *rule; chain_ctx = *ctx; chain_ctx.chain = chain; list_for_each_entry(rule, &chain->rules, list) nft_rule_expr_deactivate(&chain_ctx, rule, phase); } static void nft_immediate_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_chain *chain; if (priv->dreg == NFT_REG_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; switch (phase) { case NFT_TRANS_PREPARE_ERROR: nf_tables_unbind_chain(ctx, chain); nft_deactivate_next(ctx->net, chain); break; case NFT_TRANS_PREPARE: nft_immediate_chain_deactivate(ctx, chain, phase); nft_deactivate_next(ctx->net, chain); break; default: nft_immediate_chain_deactivate(ctx, chain, phase); nft_chain_del(chain); chain->bound = false; nft_use_dec(&chain->table->use); break; } break; default: break; } } if (phase == NFT_TRANS_COMMIT) return; return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } static void nft_immediate_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_rule *rule, *n; struct nft_ctx chain_ctx; struct nft_chain *chain; if (priv->dreg != NFT_REG_VERDICT) return; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; /* Rule construction failed, but chain is already bound: * let the transaction records release this chain and its rules. */ if (chain->bound) { nft_use_dec(&chain->use); break; } /* Rule has been deleted, release chain and its rules. */ chain_ctx = *ctx; chain_ctx.chain = chain; nft_use_dec(&chain->use); list_for_each_entry_safe(rule, n, &chain->rules, list) { nft_use_dec(&chain->use); list_del(&rule->list); nf_tables_rule_destroy(&chain_ctx, rule); } nf_tables_chain_destroy(chain); break; default: break; } } static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_IMMEDIATE_DREG, priv->dreg)) goto nla_put_failure; return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data, nft_dreg_to_type(priv->dreg), priv->dlen); nla_put_failure: return -1; } static int nft_immediate_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; int err; if (priv->dreg != NFT_REG_VERDICT) return 0; data = &priv->data; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: pctx->level++; err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; pctx->level--; break; default: break; } return 0; } static int nft_immediate_offload_verdict(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_immediate_expr *priv) { struct flow_action_entry *entry; const struct nft_data *data; entry = &flow->rule->action.entries[ctx->num_actions++]; data = &priv->data; switch (data->verdict.code) { case NF_ACCEPT: entry->id = FLOW_ACTION_ACCEPT; break; case NF_DROP: entry->id = FLOW_ACTION_DROP; break; default: return -EOPNOTSUPP; } return 0; } static int nft_immediate_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg == NFT_REG_VERDICT) return nft_immediate_offload_verdict(ctx, flow, priv); memcpy(&ctx->regs[priv->dreg].data, &priv->data, sizeof(priv->data)); return 0; } static bool nft_immediate_offload_action(const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg == NFT_REG_VERDICT) return true; return false; } static bool nft_immediate_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg != NFT_REG_VERDICT) nft_reg_track_cancel(track, priv->dreg, priv->dlen); return false; } static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), .eval = nft_immediate_eval, .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, .reduce = nft_immediate_reduce, .offload = nft_immediate_offload, .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { .name = "immediate", .ops = &nft_imm_ops, .policy = nft_immediate_policy, .maxattr = NFTA_IMMEDIATE_MAX, .owner = THIS_MODULE, };
2180 2063 260 107 2168 170 20 2184 111 1 5 5 42 14 15 13 9 6 1 142 287 287 75 213 216 88 9 7 8 7 4 285 286 285 286 286 10 10 8 2 286 210 210 114 142 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic parts * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/init.h> #include <linux/llc.h> #include <net/llc.h> #include <net/stp.h> #include <net/switchdev.h> #include "br_private.h" /* * Handle changes in state of network devices enslaved to a bridge. * * Note: don't care about up/down if bridge itself is down, because * port state is checked when bridge is brought up. */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; bool notified = false; bool changed_addr; int err; if (netif_is_bridge_master(dev)) { err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); if (event == NETDEV_REGISTER) { /* register of bridge completed, add sysfs entries */ err = br_sysfs_addbr(dev); if (err) return notifier_from_errno(err); return NOTIFY_DONE; } } if (is_vlan_dev(dev)) { struct net_device *real_dev = vlan_dev_real_dev(dev); if (netif_is_bridge_master(real_dev)) br_vlan_vlan_upper_event(real_dev, dev, event); } /* not a port of a bridge */ p = br_port_get_rtnl(dev); if (!p) return NOTIFY_DONE; br = p->br; switch (event) { case NETDEV_CHANGEMTU: br_mtu_auto_adjust(br); break; case NETDEV_PRE_CHANGEADDR: if (br->dev->addr_assign_type == NET_ADDR_SET) break; prechaddr_info = ptr; err = dev_pre_changeaddr_notify(br->dev, prechaddr_info->dev_addr, extack); if (err) return notifier_from_errno(err); break; case NETDEV_CHANGEADDR: spin_lock_bh(&br->lock); br_fdb_changeaddr(p, dev->dev_addr); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); break; case NETDEV_CHANGE: br_port_carrier_check(p, &notified); break; case NETDEV_FEAT_CHANGE: netdev_update_features(br->dev); break; case NETDEV_DOWN: spin_lock_bh(&br->lock); if (br->dev->flags & IFF_UP) { br_stp_disable_port(p); notified = true; } spin_unlock_bh(&br->lock); break; case NETDEV_UP: if (netif_running(br->dev) && netif_oper_up(dev)) { spin_lock_bh(&br->lock); br_stp_enable_port(p); notified = true; spin_unlock_bh(&br->lock); } break; case NETDEV_UNREGISTER: br_del_if(br, dev); break; case NETDEV_CHANGENAME: err = br_sysfs_renameif(p); if (err) return notifier_from_errno(err); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, br->dev); break; } if (event != NETDEV_UNREGISTER) br_vlan_port_event(p, event); /* Events that may cause spanning tree to refresh */ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP || event == NETDEV_CHANGE || event == NETDEV_DOWN)) br_ifinfo_notify(RTM_NEWLINK, NULL, p); return NOTIFY_DONE; } static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; /* called with RTNL or RCU */ static int br_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; p = br_port_get_rtnl_rcu(dev); if (!p) goto out; br = p->br; switch (event) { case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid, fdb_info->locked, false); if (err) { err = notifier_from_errno(err); break; } br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid, false); if (err) err = notifier_from_errno(err); break; case SWITCHDEV_FDB_OFFLOADED: fdb_info = ptr; br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_FLUSH_TO_BRIDGE: fdb_info = ptr; /* Don't delete static entries */ br_fdb_delete_by_port(br, p, fdb_info->vid, 0); break; } out: return err; } static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; /* called under rtnl_mutex */ static int br_switchdev_blocking_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_brport_info *brport_info; const struct switchdev_brport *b; struct net_bridge_port *p; int err = NOTIFY_DONE; p = br_port_get_rtnl(dev); if (!p) goto out; switch (event) { case SWITCHDEV_BRPORT_OFFLOADED: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_offload(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, b->tx_fwd_offload, extack); err = notifier_from_errno(err); break; case SWITCHDEV_BRPORT_UNOFFLOADED: brport_info = ptr; b = &brport_info->brport; br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; case SWITCHDEV_BRPORT_REPLAY: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, extack); err = notifier_from_errno(err); break; } out: return err; } static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device * @opt: id of the option to change * @on: new option value * @extack: extack for error messages * * Changes the value of the respective boolean option to @on taking care of * any internal option value mapping and configuration. */ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { int err = 0; switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; case BR_BOOLOPT_MCAST_VLAN_SNOOPING: err = br_multicast_toggle_vlan_snooping(br, on, extack); break; case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) { switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); case BR_BOOLOPT_MCAST_VLAN_SNOOPING: return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return 0; } int br_boolopt_multi_toggle(struct net_bridge *br, struct br_boolopt_multi *bm, struct netlink_ext_ack *extack) { unsigned long bitmap = bm->optmask; int err = 0; int opt_id; for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); err = br_boolopt_toggle(br, opt_id, on, extack); if (err) { br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", opt_id, br_boolopt_get(br, opt_id), on, err); break; } } return err; } void br_boolopt_multi_get(const struct net_bridge *br, struct br_boolopt_multi *bm) { u32 optval = 0; int opt_id; for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) optval |= (br_boolopt_get(br, opt_id) << opt_id); bm->optval = optval; bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0); } /* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); br_debug(br, "toggle option: %d state: %d -> %d\n", opt, cur, on); if (cur == on) return; if (on) set_bit(opt, &br->options); else clear_bit(opt, &br->options); } static void __net_exit br_net_exit_rtnl(struct net *net, struct list_head *dev_to_kill) { struct net_device *dev; ASSERT_RTNL_NET(net); for_each_netdev(net, dev) if (netif_is_bridge_master(dev)) br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { .exit_rtnl = br_net_exit_rtnl, }; static const struct stp_proto br_stp_proto = { .rcv = br_stp_rcv, }; static int __init br_init(void) { int err; BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb)); err = stp_proto_register(&br_stp_proto); if (err < 0) { pr_err("bridge: can't register sap for STP\n"); return err; } err = br_fdb_init(); if (err) goto err_out; err = register_pernet_subsys(&br_net_ops); if (err) goto err_out1; err = br_nf_core_init(); if (err) goto err_out2; err = register_netdevice_notifier(&br_device_notifier); if (err) goto err_out3; err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); if (err) goto err_out5; err = br_netlink_init(); if (err) goto err_out6; brioctl_set(br_ioctl_stub); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = br_fdb_test_addr; #endif #if IS_MODULE(CONFIG_BRIDGE_NETFILTER) pr_info("bridge: filtering via arp/ip/ip6tables is no longer available " "by default. Update your scripts to load br_netfilter if you " "need this.\n"); #endif return 0; err_out6: unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); err_out5: unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: br_fdb_fini(); err_out: stp_proto_unregister(&br_stp_proto); return err; } static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif br_fdb_fini(); } module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); MODULE_VERSION(BR_VERSION); MODULE_ALIAS_RTNL_LINK("bridge"); MODULE_DESCRIPTION("Ethernet bridge driver");
1 1 13 11 2 8 73 137 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Symmetric key ciphers. * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_SKCIPHER_H #define _CRYPTO_SKCIPHER_H #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/crypto.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> /* Set this bit if the lskcipher operation is a continuation. */ #define CRYPTO_LSKCIPHER_FLAG_CONT 0x00000001 /* Set this bit if the lskcipher operation is final. */ #define CRYPTO_LSKCIPHER_FLAG_FINAL 0x00000002 /* The bit CRYPTO_TFM_REQ_MAY_SLEEP can also be set if needed. */ /* Set this bit if the skcipher operation is a continuation. */ #define CRYPTO_SKCIPHER_REQ_CONT 0x00000001 /* Set this bit if the skcipher operation is not final. */ #define CRYPTO_SKCIPHER_REQ_NOTFINAL 0x00000002 struct scatterlist; /** * struct skcipher_request - Symmetric key cipher request * @cryptlen: Number of bytes to encrypt or decrypt * @iv: Initialisation Vector * @src: Source SG list * @dst: Destination SG list * @base: Underlying async request * @__ctx: Start of private context data */ struct skcipher_request { unsigned int cryptlen; u8 *iv; struct scatterlist *src; struct scatterlist *dst; struct crypto_async_request base; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; struct crypto_skcipher { unsigned int reqsize; struct crypto_tfm base; }; struct crypto_sync_skcipher { struct crypto_skcipher base; }; struct crypto_lskcipher { struct crypto_tfm base; }; /* * struct skcipher_alg_common - common properties of skcipher_alg * @min_keysize: Minimum key size supported by the transformation. This is the * smallest key length supported by this transformation algorithm. * This must be set to one of the pre-defined values as this is * not hardware specific. Possible values for this field can be * found via git grep "_MIN_KEY_SIZE" include/crypto/ * @max_keysize: Maximum key size supported by the transformation. This is the * largest key length supported by this transformation algorithm. * This must be set to one of the pre-defined values as this is * not hardware specific. Possible values for this field can be * found via git grep "_MAX_KEY_SIZE" include/crypto/ * @ivsize: IV size applicable for transformation. The consumer must provide an * IV of exactly that size to perform the encrypt or decrypt operation. * @chunksize: Equal to the block size except for stream ciphers such as * CTR where it is set to the underlying block size. * @statesize: Size of the internal state for the algorithm. * @base: Definition of a generic crypto algorithm. */ #define SKCIPHER_ALG_COMMON { \ unsigned int min_keysize; \ unsigned int max_keysize; \ unsigned int ivsize; \ unsigned int chunksize; \ unsigned int statesize; \ \ struct crypto_alg base; \ } struct skcipher_alg_common SKCIPHER_ALG_COMMON; /** * struct skcipher_alg - symmetric key cipher definition * @setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function can * be called multiple times during the existence of the transformation * object, so one must make sure the key is properly reprogrammed into * the hardware. This function is also responsible for checking the key * length for validity. In case a software fallback was put in place in * the @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. * @encrypt: Encrypt a scatterlist of blocks. This function is used to encrypt * the supplied scatterlist containing the blocks of data. The crypto * API consumer is responsible for aligning the entries of the * scatterlist properly and making sure the chunks are correctly * sized. In case a software fallback was put in place in the * @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. In case the * key was stored in transformation context, the key might need to be * re-programmed into the hardware in this function. This function * shall not modify the transformation context, as this function may * be called in parallel with the same transformation object. * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt * and the conditions are exactly the same. * @export: Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. * @import: Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @walksize: Equal to the chunk size except in cases where the algorithm is * considerably more efficient if it can operate on multiple chunks * in parallel. Should be a multiple of chunksize. * @co: see struct skcipher_alg_common * * All fields except @ivsize are mandatory and must be filled. */ struct skcipher_alg { int (*setkey)(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); int (*encrypt)(struct skcipher_request *req); int (*decrypt)(struct skcipher_request *req); int (*export)(struct skcipher_request *req, void *out); int (*import)(struct skcipher_request *req, const void *in); int (*init)(struct crypto_skcipher *tfm); void (*exit)(struct crypto_skcipher *tfm); unsigned int walksize; union { struct SKCIPHER_ALG_COMMON; struct skcipher_alg_common co; }; }; /** * struct lskcipher_alg - linear symmetric key cipher definition * @setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function can * be called multiple times during the existence of the transformation * object, so one must make sure the key is properly reprogrammed into * the hardware. This function is also responsible for checking the key * length for validity. In case a software fallback was put in place in * the @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. * @encrypt: Encrypt a number of bytes. This function is used to encrypt * the supplied data. This function shall not modify * the transformation context, as this function may be called * in parallel with the same transformation object. Data * may be left over if length is not a multiple of blocks * and there is more to come (final == false). The number of * left-over bytes should be returned in case of success. * The siv field shall be as long as ivsize + statesize with * the IV placed at the front. The state will be used by the * algorithm internally. * @decrypt: Decrypt a number of bytes. This is a reverse counterpart to * @encrypt and the conditions are exactly the same. * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @co: see struct skcipher_alg_common */ struct lskcipher_alg { int (*setkey)(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen); int (*encrypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv, u32 flags); int (*decrypt)(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv, u32 flags); int (*init)(struct crypto_lskcipher *tfm); void (*exit)(struct crypto_lskcipher *tfm); struct skcipher_alg_common co; }; #define MAX_SYNC_SKCIPHER_REQSIZE 384 /* * This performs a type-check against the "_tfm" argument to make sure * all users have the correct skcipher tfm for doing on-stack requests. */ #define SYNC_SKCIPHER_REQUEST_ON_STACK(name, _tfm) \ char __##name##_desc[sizeof(struct skcipher_request) + \ MAX_SYNC_SKCIPHER_REQSIZE \ ] CRYPTO_MINALIGN_ATTR; \ struct skcipher_request *name = \ (((struct skcipher_request *)__##name##_desc)->base.tfm = \ crypto_sync_skcipher_tfm((_tfm)), \ (void *)__##name##_desc) /** * DOC: Symmetric Key Cipher API * * Symmetric key cipher API is used with the ciphers of type * CRYPTO_ALG_TYPE_SKCIPHER (listed as type "skcipher" in /proc/crypto). * * Asynchronous cipher operations imply that the function invocation for a * cipher request returns immediately before the completion of the operation. * The cipher request is scheduled as a separate kernel thread and therefore * load-balanced on the different CPUs via the process scheduler. To allow * the kernel crypto API to inform the caller about the completion of a cipher * request, the caller must provide a callback function. That function is * invoked with the cipher handle when the request completes. * * To support the asynchronous operation, additional information than just the * cipher handle must be supplied to the kernel crypto API. That additional * information is given by filling in the skcipher_request data structure. * * For the symmetric key cipher API, the state is maintained with the tfm * cipher handle. A single tfm can be used across multiple calls and in * parallel. For asynchronous block cipher calls, context data supplied and * only used by the caller can be referenced the request data structure in * addition to the IV used for the cipher request. The maintenance of such * state information would be important for a crypto driver implementer to * have, because when calling the callback function upon completion of the * cipher operation, that callback function may need some information about * which operation just finished if it invoked multiple in parallel. This * state information is unused by the kernel crypto API. */ static inline struct crypto_skcipher *__crypto_skcipher_cast( struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_skcipher, base); } /** * crypto_alloc_skcipher() - allocate symmetric key cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an skcipher. The returned struct * crypto_skcipher is the cipher handle that is required for any subsequent * API invocation for that skcipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(const char *alg_name, u32 type, u32 mask); /** * crypto_alloc_lskcipher() - allocate linear symmetric key cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * lskcipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an lskcipher. The returned struct * crypto_lskcipher is the cipher handle that is required for any subsequent * API invocation for that lskcipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_lskcipher *crypto_alloc_lskcipher(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_skcipher_tfm( struct crypto_skcipher *tfm) { return &tfm->base; } static inline struct crypto_tfm *crypto_lskcipher_tfm( struct crypto_lskcipher *tfm) { return &tfm->base; } static inline struct crypto_tfm *crypto_sync_skcipher_tfm( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_tfm(&tfm->base); } /** * crypto_free_skcipher() - zeroize and free cipher handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_skcipher(struct crypto_skcipher *tfm) { crypto_destroy_tfm(tfm, crypto_skcipher_tfm(tfm)); } static inline void crypto_free_sync_skcipher(struct crypto_sync_skcipher *tfm) { crypto_free_skcipher(&tfm->base); } /** * crypto_free_lskcipher() - zeroize and free cipher handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_lskcipher(struct crypto_lskcipher *tfm) { crypto_destroy_tfm(tfm, crypto_lskcipher_tfm(tfm)); } /** * crypto_has_skcipher() - Search for the availability of an skcipher. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher * @type: specifies the type of the skcipher * @mask: specifies the mask for the skcipher * * Return: true when the skcipher is known to the kernel crypto API; false * otherwise */ int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_skcipher_driver_name( struct crypto_skcipher *tfm) { return crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)); } static inline const char *crypto_lskcipher_driver_name( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_driver_name(crypto_lskcipher_tfm(tfm)); } static inline struct skcipher_alg_common *crypto_skcipher_alg_common( struct crypto_skcipher *tfm) { return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, struct skcipher_alg_common, base); } static inline struct skcipher_alg *crypto_skcipher_alg( struct crypto_skcipher *tfm) { return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, struct skcipher_alg, base); } static inline struct lskcipher_alg *crypto_lskcipher_alg( struct crypto_lskcipher *tfm) { return container_of(crypto_lskcipher_tfm(tfm)->__crt_alg, struct lskcipher_alg, co.base); } /** * crypto_skcipher_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the skcipher referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_skcipher_ivsize(struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->ivsize; } static inline unsigned int crypto_sync_skcipher_ivsize( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_ivsize(&tfm->base); } /** * crypto_lskcipher_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the lskcipher referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_lskcipher_ivsize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.ivsize; } /** * crypto_skcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the skcipher referenced with the cipher handle is * returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_skcipher_blocksize( struct crypto_skcipher *tfm) { return crypto_tfm_alg_blocksize(crypto_skcipher_tfm(tfm)); } /** * crypto_lskcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the lskcipher referenced with the cipher handle is * returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_lskcipher_blocksize( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_blocksize(crypto_lskcipher_tfm(tfm)); } /** * crypto_skcipher_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CTR. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_skcipher_chunksize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->chunksize; } /** * crypto_lskcipher_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CTR. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_lskcipher_chunksize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.chunksize; } /** * crypto_skcipher_statesize() - obtain state size * @tfm: cipher handle * * Some algorithms cannot be chained with the IV alone. They carry * internal state which must be replicated if data is to be processed * incrementally. The size of that state can be obtained with this * function. * * Return: state size in bytes */ static inline unsigned int crypto_skcipher_statesize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->statesize; } /** * crypto_lskcipher_statesize() - obtain state size * @tfm: cipher handle * * Some algorithms cannot be chained with the IV alone. They carry * internal state which must be replicated if data is to be processed * incrementally. The size of that state can be obtained with this * function. * * Return: state size in bytes */ static inline unsigned int crypto_lskcipher_statesize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.statesize; } static inline unsigned int crypto_sync_skcipher_blocksize( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_blocksize(&tfm->base); } static inline unsigned int crypto_skcipher_alignmask( struct crypto_skcipher *tfm) { return crypto_tfm_alg_alignmask(crypto_skcipher_tfm(tfm)); } static inline unsigned int crypto_lskcipher_alignmask( struct crypto_lskcipher *tfm) { return crypto_tfm_alg_alignmask(crypto_lskcipher_tfm(tfm)); } static inline u32 crypto_skcipher_get_flags(struct crypto_skcipher *tfm) { return crypto_tfm_get_flags(crypto_skcipher_tfm(tfm)); } static inline void crypto_skcipher_set_flags(struct crypto_skcipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_skcipher_tfm(tfm), flags); } static inline void crypto_skcipher_clear_flags(struct crypto_skcipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_skcipher_tfm(tfm), flags); } static inline u32 crypto_sync_skcipher_get_flags( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_get_flags(&tfm->base); } static inline void crypto_sync_skcipher_set_flags( struct crypto_sync_skcipher *tfm, u32 flags) { crypto_skcipher_set_flags(&tfm->base, flags); } static inline void crypto_sync_skcipher_clear_flags( struct crypto_sync_skcipher *tfm, u32 flags) { crypto_skcipher_clear_flags(&tfm->base, flags); } static inline u32 crypto_lskcipher_get_flags(struct crypto_lskcipher *tfm) { return crypto_tfm_get_flags(crypto_lskcipher_tfm(tfm)); } static inline void crypto_lskcipher_set_flags(struct crypto_lskcipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_lskcipher_tfm(tfm), flags); } static inline void crypto_lskcipher_clear_flags(struct crypto_lskcipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_lskcipher_tfm(tfm), flags); } /** * crypto_skcipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the skcipher referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); static inline int crypto_sync_skcipher_setkey(struct crypto_sync_skcipher *tfm, const u8 *key, unsigned int keylen) { return crypto_skcipher_setkey(&tfm->base, key, keylen); } /** * crypto_lskcipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the lskcipher referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_lskcipher_setkey(struct crypto_lskcipher *tfm, const u8 *key, unsigned int keylen); static inline unsigned int crypto_skcipher_min_keysize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->min_keysize; } static inline unsigned int crypto_skcipher_max_keysize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_common(tfm)->max_keysize; } static inline unsigned int crypto_lskcipher_min_keysize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.min_keysize; } static inline unsigned int crypto_lskcipher_max_keysize( struct crypto_lskcipher *tfm) { return crypto_lskcipher_alg(tfm)->co.max_keysize; } /** * crypto_skcipher_reqtfm() - obtain cipher handle from request * @req: skcipher_request out of which the cipher handle is to be obtained * * Return the crypto_skcipher handle when furnishing an skcipher_request * data structure. * * Return: crypto_skcipher handle */ static inline struct crypto_skcipher *crypto_skcipher_reqtfm( struct skcipher_request *req) { return __crypto_skcipher_cast(req->base.tfm); } static inline struct crypto_sync_skcipher *crypto_sync_skcipher_reqtfm( struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); return container_of(tfm, struct crypto_sync_skcipher, base); } /** * crypto_skcipher_encrypt() - encrypt plaintext * @req: reference to the skcipher_request handle that holds all information * needed to perform the cipher operation * * Encrypt plaintext data using the skcipher_request handle. That data * structure and how it is filled with data is discussed with the * skcipher_request_* functions. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_encrypt(struct skcipher_request *req); /** * crypto_skcipher_decrypt() - decrypt ciphertext * @req: reference to the skcipher_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the skcipher_request handle. That data * structure and how it is filled with data is discussed with the * skcipher_request_* functions. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_decrypt(struct skcipher_request *req); /** * crypto_skcipher_export() - export partial state * @req: reference to the skcipher_request handle that holds all information * needed to perform the operation * @out: output buffer of sufficient size that can hold the state * * Export partial state of the transformation. This function dumps the * entire state of the ongoing transformation into a provided block of * data so it can be @import 'ed back later on. This is useful in case * you want to save partial result of the transformation after * processing certain amount of data and reload this partial result * multiple times later on for multiple re-use. No data processing * happens at this point. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_export(struct skcipher_request *req, void *out); /** * crypto_skcipher_import() - import partial state * @req: reference to the skcipher_request handle that holds all information * needed to perform the operation * @in: buffer holding the state * * Import partial state of the transformation. This function loads the * entire state of the ongoing transformation from a provided block of * data so the transformation can continue from this point onward. No * data processing happens at this point. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_import(struct skcipher_request *req, const void *in); /** * crypto_lskcipher_encrypt() - encrypt plaintext * @tfm: lskcipher handle * @src: source buffer * @dst: destination buffer * @len: number of bytes to process * @siv: IV + state for the cipher operation. The length of the IV must * comply with the IV size defined by crypto_lskcipher_ivsize. The * IV is then followed with a buffer with the length as specified by * crypto_lskcipher_statesize. * Encrypt plaintext data using the lskcipher handle. * * Return: >=0 if the cipher operation was successful, if positive * then this many bytes have been left unprocessed; * < 0 if an error occurred */ int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv); /** * crypto_lskcipher_decrypt() - decrypt ciphertext * @tfm: lskcipher handle * @src: source buffer * @dst: destination buffer * @len: number of bytes to process * @siv: IV + state for the cipher operation. The length of the IV must * comply with the IV size defined by crypto_lskcipher_ivsize. The * IV is then followed with a buffer with the length as specified by * crypto_lskcipher_statesize. * * Decrypt ciphertext data using the lskcipher handle. * * Return: >=0 if the cipher operation was successful, if positive * then this many bytes have been left unprocessed; * < 0 if an error occurred */ int crypto_lskcipher_decrypt(struct crypto_lskcipher *tfm, const u8 *src, u8 *dst, unsigned len, u8 *siv); /** * DOC: Symmetric Key Cipher Request Handle * * The skcipher_request data structure contains all pointers to data * required for the symmetric key cipher operation. This includes the cipher * handle (which can be used by multiple skcipher_request instances), pointer * to plaintext and ciphertext, asynchronous callback function, etc. It acts * as a handle to the skcipher_request_* API calls in a similar way as * skcipher handle to the crypto_skcipher_* API calls. */ /** * crypto_skcipher_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: number of bytes */ static inline unsigned int crypto_skcipher_reqsize(struct crypto_skcipher *tfm) { return tfm->reqsize; } /** * skcipher_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing skcipher handle in the request * data structure with a different one. */ static inline void skcipher_request_set_tfm(struct skcipher_request *req, struct crypto_skcipher *tfm) { req->base.tfm = crypto_skcipher_tfm(tfm); } static inline void skcipher_request_set_sync_tfm(struct skcipher_request *req, struct crypto_sync_skcipher *tfm) { skcipher_request_set_tfm(req, &tfm->base); } static inline struct skcipher_request *skcipher_request_cast( struct crypto_async_request *req) { return container_of(req, struct skcipher_request, base); } /** * skcipher_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the skcipher * encrypt and decrypt API calls. During the allocation, the provided skcipher * handle is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct skcipher_request *skcipher_request_alloc_noprof( struct crypto_skcipher *tfm, gfp_t gfp) { struct skcipher_request *req; req = kmalloc_noprof(sizeof(struct skcipher_request) + crypto_skcipher_reqsize(tfm), gfp); if (likely(req)) skcipher_request_set_tfm(req, tfm); return req; } #define skcipher_request_alloc(...) alloc_hooks(skcipher_request_alloc_noprof(__VA_ARGS__)) /** * skcipher_request_free() - zeroize and free request data structure * @req: request data structure cipher handle to be freed */ static inline void skcipher_request_free(struct skcipher_request *req) { kfree_sensitive(req); } static inline void skcipher_request_zero(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); memzero_explicit(req, sizeof(*req) + crypto_skcipher_reqsize(tfm)); } /** * skcipher_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * crypto_async_request data structure provided to the callback function. * * This function allows setting the callback function that is triggered once the * cipher operation completes. * * The callback function is registered with the skcipher_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void skcipher_request_set_callback(struct skcipher_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * skcipher_request_set_crypt() - set data buffers * @req: request handle * @src: source scatter / gather list * @dst: destination scatter / gather list * @cryptlen: number of bytes to process from @src * @iv: IV for the cipher operation which must comply with the IV size defined * by crypto_skcipher_ivsize * * This function allows setting of the source data and destination data * scatter / gather lists. * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is * reversed - the source is the ciphertext and the destination is the plaintext. */ static inline void skcipher_request_set_crypt( struct skcipher_request *req, struct scatterlist *src, struct scatterlist *dst, unsigned int cryptlen, void *iv) { req->src = src; req->dst = dst; req->cryptlen = cryptlen; req->iv = iv; } #endif /* _CRYPTO_SKCIPHER_H */
49 49 49 49 49 49 4 45 45 45 49 49 49 49 49 49 49 95 95 95 94 95 95 95 94 82 13 95 13 13 13 12 13 12 14 14 14 14 14 14 17 17 17 17 17 17 17 17 7 17 17 1359 1357 14 8 8 8 15 644 643 598 216 644 160 17 7 10 17 158 5 5 5 5 5 5 17 51 104 54 54 54 54 54 63 63 54 45 45 45 63 50 51 51 51 4 51 39 39 39 39 8 2 8 8 2 2 8 8 10 10 2 2 8 8 8 10 10 10 2 8 10 958 944 25 1080 1074 8 1 14 4 961 30 30 160 825 1076 1080 281 959 958 958 957 839 956 959 1076 164 165 165 165 165 123 162 162 4 123 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/filemap.c * * Copyright (C) 1994-1999 Linus Torvalds */ /* * This file handles the generic file mmap semantics used by * most "normal" filesystems (but you don't /have/ to use this: * the NFS filesystem used to do this differently, for example) */ #include <linux/export.h> #include <linux/compiler.h> #include <linux/dax.h> #include <linux/fs.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> #include <linux/capability.h> #include <linux/kernel_stat.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/syscalls.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/error-injection.h> #include <linux/hash.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/security.h> #include <linux/cpuset.h> #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include <linux/delayacct.h> #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> #include <linux/migrate.h> #include <linux/pipe_fs_i.h> #include <linux/splice.h> #include <linux/rcupdate_wait.h> #include <linux/sched/mm.h> #include <linux/sysctl.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/filemap.h> /* * FIXME: remove all knowledge of the buffer layer from the core VM */ #include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> #include "swap.h" /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. * * Shared mappings now work. 15.8.1995 Bruno. * * finished 'unifying' the page and buffer cache and SMP-threaded the * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> * * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> */ /* * Lock ordering: * * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->block_dirty_folio) * ->swap_lock (exclusive_swap_page, others) * ->i_pages lock * * ->i_rwsem * ->invalidate_lock (acquired by fs in truncate path) * ->i_mmap_rwsem (truncate->unmap_mapping_range) * * ->mmap_lock * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_lock * ->invalidate_lock (filemap_fault) * ->lock_page (filemap_fault, access_process_vm) * * ->i_rwsem (generic_perform_write) * ->mmap_lock (fault_in_readable->do_page_fault) * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_merge) * * ->anon_vma.lock * ->page_table_lock or pte_lock (anon_vma_prepare and various) * * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed) * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru) * ->private_lock (folio_remove_rmap_pte->set_page_dirty) * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) */ static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { XA_STATE(xas, &mapping->i_pages, folio->index); long nr = 1; mapping_set_update(&xas, mapping); xas_set_order(&xas, folio->index, folio_order(folio)); nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); xas_store(&xas, shadow); xas_init_marks(&xas); folio->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages -= nr; } static void filemap_unaccount_folio(struct address_space *mapping, struct folio *folio) { long nr; VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", current->comm, folio_pfn(folio)); dump_page(&folio->page, "still mapped when deleted"); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); if (mapping_exiting(mapping) && !folio_test_large(folio)) { int mapcount = folio_mapcount(folio); if (folio_ref_count(folio) >= mapcount + 2) { /* * All vmas have already been torn down, so it's * a good bet that actually the page is unmapped * and we'd rather not leak it: if we're wrong, * another bad page check should catch it later. */ atomic_set(&folio->_mapcount, -1); folio_ref_sub(folio, mapcount); } } } /* hugetlb folios do not participate in page cache accounting. */ if (folio_test_hugetlb(folio)) return; nr = folio_nr_pages(folio); __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); if (folio_test_swapbacked(folio)) { __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else if (folio_test_pmd_mappable(folio)) { __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of * unwritten data - on ordinary filesystems. * * But it's harmless on in-memory filesystems like tmpfs; and can * occur when a driver which did get_user_pages() sets page dirty * before putting it, while the inode is being finally evicted. * * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ if (WARN_ON_ONCE(folio_test_dirty(folio) && mapping_can_writeback(mapping))) folio_account_cleaned(folio, inode_to_wb(mapping->host)); } /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the i_pages lock. */ void __filemap_remove_folio(struct folio *folio, void *shadow) { struct address_space *mapping = folio->mapping; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); page_cache_delete(mapping, folio, shadow); } void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*free_folio)(struct folio *); free_folio = mapping->a_ops->free_folio; if (free_folio) free_folio(folio); folio_put_refs(folio, folio_nr_pages(folio)); } /** * filemap_remove_folio - Remove folio from page cache. * @folio: The folio. * * This must be called only on folios that are locked and have been * verified to be in the page cache. It will never put the folio into * the free list because the caller has a reference on the page. */ void filemap_remove_folio(struct folio *folio) { struct address_space *mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); } /* * page_cache_delete_batch - delete several folios from page cache * @mapping: the mapping to which folios belong * @fbatch: batch of folios to delete * * The function walks over mapping->i_pages and removes folios passed in * @fbatch from the mapping. The function expects @fbatch to be sorted * by page index and is optimised for it to be dense. * It tolerates holes in @fbatch (mapping entries at those indices are not * modified). * * The function expects the i_pages lock to be held. */ static void page_cache_delete_batch(struct address_space *mapping, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index); long total_pages = 0; int i = 0; struct folio *folio; mapping_set_update(&xas, mapping); xas_for_each(&xas, folio, ULONG_MAX) { if (i >= folio_batch_count(fbatch)) break; /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(folio)) continue; /* * A page got inserted in our range? Skip it. We have our * pages locked so they are protected from being removed. * If we see a page whose index is higher than ours, it * means our page has been removed, which shouldn't be * possible because we're holding the PageLock. */ if (folio != fbatch->folios[i]) { VM_BUG_ON_FOLIO(folio->index > fbatch->folios[i]->index, folio); continue; } WARN_ON_ONCE(!folio_test_locked(folio)); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies on it */ i++; xas_store(&xas, NULL); total_pages += folio_nr_pages(folio); } mapping->nrpages -= total_pages; } void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch) { int i; if (!folio_batch_count(fbatch)) return; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; trace_mm_filemap_delete_from_page_cache(folio); filemap_unaccount_folio(mapping, folio); } page_cache_delete_batch(mapping, fbatch); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); for (i = 0; i < folio_batch_count(fbatch); i++) filemap_free_folio(mapping, fbatch->folios[i]); } int filemap_check_errors(struct address_space *mapping) { int ret = 0; /* Check for outstanding write errors */ if (test_bit(AS_ENOSPC, &mapping->flags) && test_and_clear_bit(AS_ENOSPC, &mapping->flags)) ret = -ENOSPC; if (test_bit(AS_EIO, &mapping->flags) && test_and_clear_bit(AS_EIO, &mapping->flags)) ret = -EIO; return ret; } EXPORT_SYMBOL(filemap_check_errors); static int filemap_check_and_keep_errors(struct address_space *mapping) { /* Check for outstanding write errors */ if (test_bit(AS_EIO, &mapping->flags)) return -EIO; if (test_bit(AS_ENOSPC, &mapping->flags)) return -ENOSPC; return 0; } /** * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @wbc: the writeback_control controlling the writeout * * Call writepages on the mapping using the provided wbc to control the * writeout. * * Return: %0 on success, negative error code otherwise. */ int filemap_fdatawrite_wbc(struct address_space *mapping, struct writeback_control *wbc) { int ret; if (!mapping_can_writeback(mapping) || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(wbc, mapping->host); ret = do_writepages(mapping, wbc); wbc_detach_inode(wbc); return ret; } EXPORT_SYMBOL(filemap_fdatawrite_wbc); /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write * @start: offset in bytes where the range starts * @end: offset in bytes where the range ends (inclusive) * @sync_mode: enable synchronous operation * * Start writeback against all of a mapping's dirty pages that lie * within the byte offsets <start, end> inclusive. * * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as * opposed to a regular memory cleansing writeback. The difference between * these two operations is that if a dirty page/buffer is encountered, it must * be waited upon, and not just skipped over. * * Return: %0 on success, negative error code otherwise. */ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end, int sync_mode) { struct writeback_control wbc = { .sync_mode = sync_mode, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; return filemap_fdatawrite_wbc(mapping, &wbc); } static inline int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) { return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); } int filemap_fdatawrite(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); } EXPORT_SYMBOL(filemap_fdatawrite_range); /** * filemap_fdatawrite_range_kick - start writeback on a range * @mapping: target address_space * @start: index to start writeback on * @end: last (inclusive) index for writeback * * This is a non-integrity writeback helper, to start writing back folios * for the indicated range. * * Return: %0 on success, negative error code otherwise. */ int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, loff_t end) { return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE); } EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick); /** * filemap_flush - mostly a non-blocking flush * @mapping: target address_space * * This is a mostly non-blocking flush. Not suitable for data-integrity * purposes - I/O may not be started against all dirty pages. * * Return: %0 on success, negative error code otherwise. */ int filemap_flush(struct address_space *mapping) { return __filemap_fdatawrite(mapping, WB_SYNC_NONE); } EXPORT_SYMBOL(filemap_flush); /** * filemap_range_has_page - check if a page exists in range. * @mapping: address space within which to check * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Find at least one page in the range supplied, usually used to check if * direct writing in this range will trigger a writeback. * * Return: %true if at least one page exists in the specified range, * %false otherwise. */ bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { struct folio *folio; XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; rcu_read_lock(); for (;;) { folio = xas_find(&xas, max); if (xas_retry(&xas, folio)) continue; /* Shadow entries don't count */ if (xa_is_value(folio)) continue; /* * We don't need to try to pin this page; we're about to * release the RCU lock anyway. It is enough to know that * there was a page here recently. */ break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL(filemap_range_has_page); static void __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; struct folio_batch fbatch; unsigned nr_folios; folio_batch_init(&fbatch); while (index <= end) { unsigned i; nr_folios = filemap_get_folios_tag(mapping, &index, end, PAGECACHE_TAG_WRITEBACK, &fbatch); if (!nr_folios) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; folio_wait_writeback(folio); } folio_batch_release(&fbatch); cond_resched(); } } /** * filemap_fdatawait_range - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space * in the given range and wait for all of them. Check error status of * the address space and return it. * * Since the error status of the address space is cleared by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space. */ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait_range_keep_errors - wait for writeback to complete * @mapping: address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the given address space in the * given range and wait for all of them. Unlike filemap_fdatawait_range(), * this function does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) */ int filemap_fdatawait_range_keep_errors(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { __filemap_fdatawait_range(mapping, start_byte, end_byte); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors); /** * file_fdatawait_range - wait for writeback to complete * @file: file pointing to address space structure to wait for * @start_byte: offset in bytes where the range starts * @end_byte: offset in bytes where the range ends (inclusive) * * Walk the list of under-writeback pages of the address space that file * refers to, in the given range and wait for all of them. Check error * status of the address space vs. the file->f_wb_err cursor and return it. * * Since the error status of the file is advanced by this function, * callers are responsible for checking the return value and handling and/or * reporting the error. * * Return: error status of the address space vs. the file->f_wb_err cursor. */ int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) { struct address_space *mapping = file->f_mapping; __filemap_fdatawait_range(mapping, start_byte, end_byte); return file_check_and_advance_wb_err(file); } EXPORT_SYMBOL(file_fdatawait_range); /** * filemap_fdatawait_keep_errors - wait for writeback without clearing errors * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space * and wait for all of them. Unlike filemap_fdatawait(), this function * does not clear error status of the address space. * * Use this function if callers don't handle errors themselves. Expected * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) * * Return: error status of the address space. */ int filemap_fdatawait_keep_errors(struct address_space *mapping) { __filemap_fdatawait_range(mapping, 0, LLONG_MAX); return filemap_check_and_keep_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { return mapping->nrpages; } bool filemap_range_has_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) continue; if (folio_test_dirty(folio) || folio_test_locked(folio) || folio_test_writeback(folio)) break; } rcu_read_unlock(); return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * Return: error status of the address space. */ int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { int err = 0, err2; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* * Even if the above returned error, the pages may be * written partially (e.g. -ENOSPC), so we wait for it. * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = filemap_check_errors(mapping); if (!err) err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); void __filemap_set_wb_err(struct address_space *mapping, int err) { errseq_t eseq = errseq_set(&mapping->wb_err, err); trace_filemap_set_wb_err(mapping, eseq); } EXPORT_SYMBOL(__filemap_set_wb_err); /** * file_check_and_advance_wb_err - report wb error (if any) that was previously * and advance wb_err to current one * @file: struct file on which the error is being reported * * When userland calls fsync (or something like nfsd does the equivalent), we * want to report any writeback errors that occurred since the last fsync (or * since the file was opened if there haven't been any). * * Grab the wb_err from the mapping. If it matches what we have in the file, * then just quickly return 0. The file is all caught up. * * If it doesn't match, then take the mapping value, set the "seen" flag in * it and try to swap it into place. If it works, or another task beat us * to it with the new value, then update the f_wb_err and return the error * portion. The error at this point must be reported via proper channels * (a'la fsync, or NFS COMMIT operation, etc.). * * While we handle mapping->wb_err with atomic operations, the f_wb_err * value is protected by the f_lock since we must ensure that it reflects * the latest value swapped in for this file descriptor. * * Return: %0 on success, negative error code otherwise. */ int file_check_and_advance_wb_err(struct file *file) { int err = 0; errseq_t old = READ_ONCE(file->f_wb_err); struct address_space *mapping = file->f_mapping; /* Locklessly handle the common case where nothing has changed */ if (errseq_check(&mapping->wb_err, old)) { /* Something changed, must use slow path */ spin_lock(&file->f_lock); old = file->f_wb_err; err = errseq_check_and_advance(&mapping->wb_err, &file->f_wb_err); trace_file_check_and_advance_wb_err(file, old); spin_unlock(&file->f_lock); } /* * We're mostly using this function as a drop in replacement for * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect * that the legacy code would have had on these flags. */ clear_bit(AS_EIO, &mapping->flags); clear_bit(AS_ENOSPC, &mapping->flags); return err; } EXPORT_SYMBOL(file_check_and_advance_wb_err); /** * file_write_and_wait_range - write out & wait on a file range * @file: file pointing to address_space with pages * @lstart: offset in bytes where the range starts * @lend: offset in bytes where the range ends (inclusive) * * Write out and wait upon file offsets lstart->lend, inclusive. * * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). * * After writing out and waiting on the data, we check and advance the * f_wb_err cursor to the latest value, and return any errors detected there. * * Return: %0 on success, negative error code otherwise. */ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) { int err = 0, err2; struct address_space *mapping = file->f_mapping; if (lend < lstart) return 0; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) __filemap_fdatawait_range(mapping, lstart, lend); } err2 = file_check_and_advance_wb_err(file); if (!err) err = err2; return err; } EXPORT_SYMBOL(file_write_and_wait_range); /** * replace_page_cache_folio - replace a pagecache folio with a new one * @old: folio to be replaced * @new: folio to replace with * * This function replaces a folio in the pagecache with a new one. On * success it acquires the pagecache reference for the new folio and * drops it for the old folio. Both the old and new folios must be * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. */ void replace_page_cache_folio(struct folio *old, struct folio *new) { struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); VM_BUG_ON_FOLIO(!folio_test_locked(old), old); VM_BUG_ON_FOLIO(!folio_test_locked(new), new); VM_BUG_ON_FOLIO(new->mapping, new); folio_get(new); new->mapping = mapping; new->index = offset; mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ if (!folio_test_hugetlb(old)) __lruvec_stat_sub_folio(old, NR_FILE_PAGES); if (!folio_test_hugetlb(new)) __lruvec_stat_add_folio(new, NR_FILE_PAGES); if (folio_test_swapbacked(old)) __lruvec_stat_sub_folio(old, NR_SHMEM); if (folio_test_swapbacked(new)) __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) free_folio(old); folio_put(old); } EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); bool huge; long nr; unsigned int forder = folio_order(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), folio); mapping_set_update(&xas, mapping); VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); huge = folio_test_hugetlb(folio); nr = folio_nr_pages(folio); gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = xas.xa_index; for (;;) { int order = -1; void *entry, *old = NULL; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; if (!xa_is_value(entry)) { xas_set_err(&xas, -EEXIST); goto unlock; } /* * If a larger entry exists, * it will be the first and only entry iterated. */ if (order == -1) order = xas_get_order(&xas); } if (old) { if (order > 0 && order > forder) { unsigned int split_order = max(forder, xas_try_split_min_order(order)); /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); while (order > forder) { xas_set_order(&xas, index, split_order); xas_try_split(&xas, old, order); if (xas_error(&xas)) goto unlock; order = split_order; split_order = max(xas_try_split_min_order( split_order), forder); } xas_reset(&xas); } if (shadowp) *shadowp = old; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; mapping->nrpages += nr; /* hugetlb pages do not participate in page cache accounting */ if (!huge) { __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } unlock: xas_unlock_irq(&xas); if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) goto error; trace_mm_filemap_add_to_page_cache(folio); return 0; error: folio->mapping = NULL; /* Leave page->index set: truncation relies upon it */ folio_put_refs(folio, nr); return xas_error(&xas); } ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO); int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp) { void *shadow = NULL; int ret; ret = mem_cgroup_charge(folio, NULL, gfp); if (ret) return ret; __folio_set_locked(folio); ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow); if (unlikely(ret)) { mem_cgroup_uncharge(folio); __folio_clear_locked(folio); } else { /* * The folio might have been evicted from cache only * recently, in which case it should be activated like * any other repeatedly accessed folio. * The exception is folios getting rewritten; evicting other * data from the working set, only to cache data that will * get overwritten with something else, is a waste of memory. */ WARN_ON_ONCE(folio_test_active(folio)); if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); } return ret; } EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { int n; struct folio *folio; if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); folio = __folio_alloc_node_noprof(gfp, order, n); } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie)); return folio; } return folio_alloc_noprof(gfp, order); } EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* * filemap_invalidate_lock_two - lock invalidate_lock for two mappings * * Lock exclusively invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to lock * @mapping2: the second mapping to lock */ void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1 > mapping2) swap(mapping1, mapping2); if (mapping1) down_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) down_write_nested(&mapping2->invalidate_lock, 1); } EXPORT_SYMBOL(filemap_invalidate_lock_two); /* * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings * * Unlock exclusive invalidate_lock of any passed mapping that is not NULL. * * @mapping1: the first mapping to unlock * @mapping2: the second mapping to unlock */ void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2) { if (mapping1) up_write(&mapping1->invalidate_lock); if (mapping2 && mapping1 != mapping2) up_write(&mapping2->invalidate_lock); } EXPORT_SYMBOL(filemap_invalidate_unlock_two); /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of * waitqueues where the bucket discipline is to maintain all * waiters on the same queue and wake all when any of the pages * become available, and for the woken contexts to check to be * sure the appropriate page became available, this saves space * at a cost of "thundering herd" phenomena during rare hash * collisions. */ #define PAGE_WAIT_TABLE_BITS 8 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS) static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned; static wait_queue_head_t *folio_waitqueue(struct folio *folio) { return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)]; } /* How many times do we accept lock stealing from under a waiter? */ static int sysctl_page_lock_unfairness = 5; static const struct ctl_table filemap_sysctl_table[] = { { .procname = "page_lock_unfairness", .data = &sysctl_page_lock_unfairness, .maxlen = sizeof(sysctl_page_lock_unfairness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, } }; void __init pagecache_init(void) { int i; for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++) init_waitqueue_head(&folio_wait_table[i]); page_writeback_init(); register_sysctl_init("vm", filemap_sysctl_table); } /* * The page wait code treats the "wait->flags" somewhat unusually, because * we have multiple different kinds of waits, not just the usual "exclusive" * one. * * We have: * * (a) no special bits set: * * We're just waiting for the bit to be released, and when a waker * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, * and remove it from the wait queue. * * Simple and straightforward. * * (b) WQ_FLAG_EXCLUSIVE: * * The waiter is waiting to get the lock, and only one waiter should * be woken up to avoid any thundering herd behavior. We'll set the * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. * * This is the traditional exclusive wait. * * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: * * The waiter is waiting to get the bit, and additionally wants the * lock to be transferred to it for fair lock behavior. If the lock * cannot be taken, we stop walking the wait queue without waking * the waiter. * * This is the "fair lock handoff" case, and in addition to setting * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see * that it now has the lock. */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); if (!wake_page_match(wait_page, key)) return 0; /* * If it's a lock handoff wait, we get the bit for it, and * stop walking (and do not wake it up) if we can't. */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { if (test_bit(key->bit_nr, &key->folio->flags)) return -1; if (flags & WQ_FLAG_CUSTOM) { if (test_and_set_bit(key->bit_nr, &key->folio->flags)) return -1; flags |= WQ_FLAG_DONE; } } /* * We are holding the wait-queue lock, but the waiter that * is waiting for this will be checking the flags without * any locking. * * So update the flags atomically, and wake up the waiter * afterwards to avoid any races. This store-release pairs * with the load-acquire in folio_wait_bit_common(). */ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * * Note that this pairs with the "finish_wait()" in the * waiter, and has to be the absolute last thing we do. * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void folio_wake_bit(struct folio *folio, int bit_nr) { wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; spin_lock_irqsave(&q->lock, flags); __wake_up_locked_key(q, TASK_NORMAL, &key); /* * It's possible to miss clearing waiters here, when we woke our page * waiters, but the hashed waitqueue has waiters for other pages on it. * That's okay, it's a rare case. The next waker will clear it. * * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE, * other), the flag may be cleared in the course of freeing the page; * but that is not required for correctness. */ if (!waitqueue_active(q) || !key.page_match) folio_clear_waiters(folio); spin_unlock_irqrestore(&q->lock, flags); } /* * A choice of three behaviors for folio_wait_bit_common(): */ enum behavior { EXCLUSIVE, /* Hold ref to page and take the bit when woken, like * __folio_lock() waiting on then setting PG_locked. */ SHARED, /* Hold ref to page and check the bit when woken, like * folio_wait_writeback() waiting on PG_writeback. */ DROP, /* Drop ref to page before wait, no check when woken, * like folio_put_wait_locked() on PG_locked. */ }; /* * Attempt to check (or get) the folio flag, and mark us done * if successful. */ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { if (test_and_set_bit(bit_nr, &folio->flags)) return false; } else if (test_bit(bit_nr, &folio->flags)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } static inline int folio_wait_bit_common(struct folio *folio, int bit_nr, int state, enum behavior behavior) { wait_queue_head_t *q = folio_waitqueue(folio); int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; if (bit_nr == PG_locked && !folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = bit_nr; repeat: wait->flags = 0; if (behavior == EXCLUSIVE) { wait->flags = WQ_FLAG_EXCLUSIVE; if (--unfairness < 0) wait->flags |= WQ_FLAG_CUSTOM; } /* * Do one last check whether we can get the * page bit synchronously. * * Do the folio_set_waiters() marking before that * to let any waker we _just_ missed know they * need to wake us up (otherwise they'll never * even go to the slow case that looks at the * page queue), and add ourselves to the wait * queue if we need to sleep. * * This part needs to be done under the queue * lock to avoid races. */ spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, bit_nr, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * From now on, all the logic will be based on * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to * see whether the page bit testing has already * been done by the wake function. * * We can drop our reference to the folio. */ if (behavior == DROP) folio_put(folio); /* * Note that until the "finish_wait()", or until * we see the WQ_FLAG_WOKEN flag, we need to * be very careful with the 'wait->flags', because * we may race with a waker that sets them. */ for (;;) { unsigned int flags; set_current_state(state); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(state, current)) break; io_schedule(); continue; } /* If we were non-exclusive, we're done */ if (behavior != EXCLUSIVE) break; /* If the waker got the lock for us, we're done */ if (flags & WQ_FLAG_DONE) break; /* * Otherwise, if we're getting the lock, we need to * try to get it ourselves. * * And if that fails, we'll have to retry this all. */ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0)))) goto repeat; wait->flags |= WQ_FLAG_DONE; break; } /* * If a signal happened, this 'finish_wait()' may remove the last * waiter from the wait-queues, but the folio waiters bit will remain * set. That's ok. The next wakeup will take care of it, and trying * to do it here would be difficult and prone to races. */ finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } /* * NOTE! The wait->flags weren't stable until we've done the * 'finish_wait()', and we could have exited the loop above due * to a signal, and had a wakeup event happen after the signal * test but before the 'finish_wait()'. * * So only after the finish_wait() can we reliably determine * if we got woken up or not, so we can now figure out the final * return value based on that state without races. * * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive * waiter, but an exclusive one requires WQ_FLAG_DONE. */ if (behavior == EXCLUSIVE) return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } #ifdef CONFIG_MIGRATION /** * migration_entry_wait_on_locked - Wait for a migration entry to be removed * @entry: migration swap entry. * @ptl: already locked ptl. This function will drop the lock. * * Wait for a migration entry referencing the given page to be removed. This is * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this * should be called while holding the ptl for the migration entry referencing * the page. * * Returns after unlocking the ptl. * * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; unsigned long pflags; bool in_thrashing; wait_queue_head_t *q; struct folio *folio = pfn_swap_entry_folio(entry); q = folio_waitqueue(folio); if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { delayacct_thrashing_start(&in_thrashing); psi_memstall_enter(&pflags); thrashing = true; } init_wait(wait); wait->func = wake_page_function; wait_page.folio = folio; wait_page.bit_nr = PG_locked; wait->flags = 0; spin_lock_irq(&q->lock); folio_set_waiters(folio); if (!folio_trylock_flag(folio, PG_locked, wait)) __add_wait_queue_entry_tail(q, wait); spin_unlock_irq(&q->lock); /* * If a migration entry exists for the page the migration path must hold * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. */ spin_unlock(ptl); for (;;) { unsigned int flags; set_current_state(TASK_UNINTERRUPTIBLE); /* Loop until we've been woken or interrupted */ flags = smp_load_acquire(&wait->flags); if (!(flags & WQ_FLAG_WOKEN)) { if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) break; io_schedule(); continue; } break; } finish_wait(q, wait); if (thrashing) { delayacct_thrashing_end(&in_thrashing); psi_memstall_leave(&pflags); } } #endif void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit); int folio_wait_bit_killable(struct folio *folio, int bit_nr) { return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED); } EXPORT_SYMBOL(folio_wait_bit_killable); /** * folio_put_wait_locked - Drop a reference and wait for it to be unlocked * @folio: The folio to wait for. * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @folio. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the folio to * come unlocked. After this function returns, the caller should not * dereference @folio. * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } /** * folio_unlock - Unlock a locked folio. * @folio: The folio. * * Unlocks the folio and wakes up any thread sleeping on the page lock. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_unlock(struct folio *folio) { /* Bit 7 allows x86 to check the byte's sign bit */ BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (folio_xor_flags_has_waiters(folio, 1 << PG_locked)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); /** * folio_end_read - End read on a folio. * @folio: The folio. * @success: True if all reads completed successfully. * * When all reads against a folio have completed, filesystems should * call this function to let the pagecache know that no more reads * are outstanding. This will unlock the folio and wake up any thread * sleeping on the lock. The folio will also be marked uptodate if all * reads succeeded. * * Context: May be called from interrupt or process context. May not be * called from NMI context. */ void folio_end_read(struct folio *folio, bool success) { unsigned long mask = 1 << PG_locked; /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; if (folio_xor_flags_has_waiters(folio, mask)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_end_read); /** * folio_end_private_2 - Clear PG_private_2 and wake any waiters. * @folio: The folio. * * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for * it. The folio reference held for PG_private_2 being set is released. * * This is, for example, used when a netfs folio is being written to a local * disk cache, thereby allowing writes to the cache for the same folio to be * serialised. */ void folio_end_private_2(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); folio_wake_bit(folio, PG_private_2); folio_put(folio); } EXPORT_SYMBOL(folio_end_private_2); /** * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio. */ void folio_wait_private_2(struct folio *folio) { while (folio_test_private_2(folio)) folio_wait_bit(folio, PG_private_2); } EXPORT_SYMBOL(folio_wait_private_2); /** * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. * @folio: The folio to wait on. * * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is * received by the calling task. * * Return: * - 0 if successful. * - -EINTR if a fatal signal was encountered. */ int folio_wait_private_2_killable(struct folio *folio) { int ret = 0; while (folio_test_private_2(folio)) { ret = folio_wait_bit_killable(folio, PG_private_2); if (ret < 0) break; } return ret; } EXPORT_SYMBOL(folio_wait_private_2_killable); /* * If folio was marked as dropbehind, then pages should be dropped when writeback * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. */ static void folio_end_dropbehind_write(struct folio *folio) { /* * Hitting !in_task() should not happen off RWF_DONTCACHE writeback, * but can happen if normal writeback just happens to find dirty folios * that were created as part of uncached writeback, and that writeback * would otherwise not need non-IRQ handling. Just skip the * invalidation in that case. */ if (in_task() && folio_trylock(folio)) { if (folio->mapping) folio_unmap_invalidate(folio->mapping, folio, 0); folio_unlock(folio); } } /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. * * Context: May be called from process or interrupt context. */ void folio_end_writeback(struct folio *folio) { bool folio_dropbehind = false; VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); /* * folio_test_clear_reclaim() could be used here but it is an * atomic operation and overkill in this particular case. Failing * to shuffle a folio marked for immediate reclaim is too mild * a gain to justify taking an atomic operation penalty at the * end of every folio writeback. */ if (folio_test_reclaim(folio)) { folio_clear_reclaim(folio); folio_rotate_reclaimable(folio); } /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. * But here we must make sure that the folio is not freed and * reused before the folio_wake_bit(). */ folio_get(folio); if (!folio_test_dirty(folio)) folio_dropbehind = folio_test_clear_dropbehind(folio); if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); if (folio_dropbehind) folio_end_dropbehind_write(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); /** * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. * @folio: The folio to lock */ void __folio_lock(struct folio *folio) { folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, EXCLUSIVE); } EXPORT_SYMBOL(__folio_lock); int __folio_lock_killable(struct folio *folio) { return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, EXCLUSIVE); } EXPORT_SYMBOL_GPL(__folio_lock_killable); static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) { struct wait_queue_head *q = folio_waitqueue(folio); int ret; wait->folio = folio; wait->bit_nr = PG_locked; spin_lock_irq(&q->lock); __add_wait_queue_entry_tail(q, &wait->wait); folio_set_waiters(folio); ret = !folio_trylock(folio); /* * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's * safe to remove and return success, we know the callback * isn't going to trigger. */ if (!ret) __remove_wait_queue(q, &wait->wait); else ret = -EIOCBQUEUED; spin_unlock_irq(&q->lock); return ret; } /* * Return values: * 0 - folio is locked. * non-zero - folio is not locked. * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed. */ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { unsigned int flags = vmf->flags; if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_lock/per-VMA lock is not * released even though returning VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) return VM_FAULT_RETRY; release_fault_lock(vmf); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else folio_wait_locked(folio); return VM_FAULT_RETRY; } if (flags & FAULT_FLAG_KILLABLE) { bool ret; ret = __folio_lock_killable(folio); if (ret) { release_fault_lock(vmf); return VM_FAULT_RETRY; } } else { __folio_lock(folio); } return 0; } /** * page_cache_next_miss() - Find the next gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the * gap with the lowest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 5, then subsequently a gap is * created at index 10, page_cache_next_miss covering both indices may * return 10 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'return - index >= max_scan' will be true). * In the rare case of index wrap-around, 0 will be returned. */ pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_next(&xas); if (!entry || xa_is_value(entry)) return xas.xa_index; if (xas.xa_index == 0) return 0; } return index + max_scan; } EXPORT_SYMBOL(page_cache_next_miss); /** * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. * * Search the range [max(index - max_scan + 1, 0), index] for the * gap with the highest index. * * This function may be called under the rcu_read_lock. However, this will * not atomically search a snapshot of the cache at a single point in time. * For example, if a gap is created at index 10, then subsequently a gap is * created at index 5, page_cache_prev_miss() covering both indices may * return 5 if called under the rcu_read_lock. * * Return: The index of the gap if found, otherwise an index outside the * range specified (in which case 'index - return >= max_scan' will be true). * In the rare case of wrap-around, ULONG_MAX will be returned. */ pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { XA_STATE(xas, &mapping->i_pages, index); while (max_scan--) { void *entry = xas_prev(&xas); if (!entry || xa_is_value(entry)) break; if (xas.xa_index == ULONG_MAX) break; } return xas.xa_index; } EXPORT_SYMBOL(page_cache_prev_miss); /* * Lockless page cache protocol: * On the lookup side: * 1. Load the folio from i_pages * 2. Increment the refcount if it's not zero * 3. If the folio is not found by xas_reload(), put the refcount and retry * * On the removal side: * A. Freeze the page (by zeroing the refcount if nobody else has a reference) * B. Remove the page from i_pages * C. Return the page to the page allocator * * This means that any page may have its reference count temporarily * increased by a speculative page cache (or GUP-fast) lookup as it can * be allocated by another user before the RCU grace period expires. * Because the refcount temporarily acquired here may end up being the * last refcount on the page, any page allocation must be freeable by * folio_put(). */ /* * filemap_get_entry - Get a page cache entry. * @mapping: the address_space to search * @index: The page cache index. * * Looks up the page cache entry at @mapping & @index. If it is a folio, * it is returned with an increased refcount. If it is a shadow entry * of a previously evicted folio, or a swap entry from shmem/tmpfs, * it is returned without further action. * * Return: The folio, swap or shadow entry, %NULL if nothing is found. */ void *filemap_get_entry(struct address_space *mapping, pgoff_t index) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); repeat: xas_reset(&xas); folio = xas_load(&xas); if (xas_retry(&xas, folio)) goto repeat; /* * A shadow entry of a recently evicted page, or a swap entry from * shmem/tmpfs. Return it without attempting to raise page count. */ if (!folio || xa_is_value(folio)) goto out; if (!folio_try_get(folio)) goto repeat; if (unlikely(folio != xas_reload(&xas))) { folio_put(folio); goto repeat; } out: rcu_read_unlock(); return folio; } /** * __filemap_get_folio - Find and get a reference to a folio. * @mapping: The address_space to search. * @index: The page index. * @fgp_flags: %FGP flags modify how the folio is returned. * @gfp: Memory allocation flags to use if %FGP_CREAT is specified. * * Looks up the page cache entry at @mapping & @index. * * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even * if the %GFP flags specified for %FGP_CREAT are atomic. * * If this function returns a folio, it is returned with an increased refcount. * * Return: The found folio or an ERR_PTR() otherwise. */ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp) { struct folio *folio; repeat: folio = filemap_get_entry(mapping, index); if (xa_is_value(folio)) folio = NULL; if (!folio) goto no_page; if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!folio_trylock(folio)) { folio_put(folio); return ERR_PTR(-EAGAIN); } } else { folio_lock(folio); } /* Has the page been truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); } if (fgp_flags & FGP_ACCESSED) folio_mark_accessed(folio); else if (fgp_flags & FGP_WRITE) { /* Clear idle flag for buffer write */ if (folio_test_idle(folio)) folio_clear_idle(folio); } if (fgp_flags & FGP_STABLE) folio_wait_stable(folio); no_page: if (!folio && (fgp_flags & FGP_CREAT)) { unsigned int min_order = mapping_min_folio_order(mapping); unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); int err; index = mapping_align_index(mapping, index); if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; gfp |= GFP_NOWAIT | __GFP_NOWARN; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; if (order > mapping_max_folio_order(mapping)) order = mapping_max_folio_order(mapping); /* If we're not aligned, allocate a smaller folio */ if (index & ((1UL << order) - 1)) order = __ffs(index); do { gfp_t alloc_gfp = gfp; err = -ENOMEM; if (order > min_order) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); if (!folio) continue; /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) __folio_set_referenced(folio); if (fgp_flags & FGP_DONTCACHE) __folio_set_dropbehind(folio); err = filemap_add_folio(mapping, folio, index, gfp); if (!err) break; folio_put(folio); folio = NULL; } while (order-- > min_order); if (err == -EEXIST) goto repeat; if (err) { /* * When NOWAIT I/O fails to allocate folios this could * be due to a nonblocking memory allocation and not * because the system actually is out of memory. * Return -EAGAIN so that there caller retries in a * blocking fashion instead of propagating -ENOMEM * to the application. */ if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM) err = -EAGAIN; return ERR_PTR(err); } /* * filemap_add_folio locks the page, and for mmap * we expect an unlocked page. */ if (folio && (fgp_flags & FGP_FOR_MMAP)) folio_unlock(folio); } if (!folio) return ERR_PTR(-ENOENT); /* not an uncached lookup, clear uncached if set */ if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) folio_clear_dropbehind(folio); return folio; } EXPORT_SYMBOL(__filemap_get_folio); static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max, xa_mark_t mark) { struct folio *folio; retry: if (mark == XA_PRESENT) folio = xas_find(xas, max); else folio = xas_find_marked(xas, max, mark); if (xas_retry(xas, folio)) goto retry; /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it * without attempting to raise page count. */ if (!folio || xa_is_value(folio)) return folio; if (!folio_try_get(folio)) goto reset; if (unlikely(folio != xas_reload(xas))) { folio_put(folio); goto reset; } return folio; reset: xas_reset(xas); goto retry; } /** * find_get_entries - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page cache index * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices corresponding to the entries in @entries * * find_get_entries() will search for and return a batch of entries in * the mapping. The entries are placed in @fbatch. find_get_entries() * takes a reference on any actual folios it returns. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries or large folios. * * Any shadow entries of evicted folios, or swap entries from * shmem/tmpfs, are included in the returned array. * * Return: The number of entries which were found. */ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; } if (folio_batch_count(fbatch)) { unsigned long nr; int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; if (!xa_is_value(folio)) nr = folio_nr_pages(folio); else nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]); *start = round_down(indices[idx] + nr, nr); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * find_lock_entries - Find a batch of pagecache entries. * @mapping: The address_space to search. * @start: The starting page cache index. * @end: The final page index (inclusive). * @fbatch: Where the resulting entries are placed. * @indices: The cache indices of the entries in @fbatch. * * find_lock_entries() will return a batch of entries from @mapping. * Swap, shadow and DAX entries are included. Folios are returned * locked and with an incremented refcount. Folios which are locked * by somebody else or under writeback are skipped. Folios which are * partially outside the range are not returned. * * The entries have ascending indexes. The indices may not be consecutive * due to not-present entries, large folios, folios which could not be * locked or folios under writeback. * * Return: The number of entries which were found. */ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { unsigned long base; unsigned long nr; if (!xa_is_value(folio)) { nr = folio_nr_pages(folio); base = folio->index; /* Omit large folio which begins before the start */ if (base < *start) goto put; /* Omit large folio which extends beyond the end */ if (base + nr - 1 > end) goto put; if (!folio_trylock(folio)) goto put; if (folio->mapping != mapping || folio_test_writeback(folio)) goto unlock; VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index), folio); } else { nr = 1 << xas_get_order(&xas); base = xas.xa_index & ~(nr - 1); /* Omit order>0 value which begins before the start */ if (base < *start) continue; /* Omit order>0 value which extends beyond the end */ if (base + nr - 1 > end) break; } /* Update start now so that last update is correct on return */ *start = base + nr; indices[fbatch->nr] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; continue; unlock: folio_unlock(folio); put: folio_put(folio); } rcu_read_unlock(); return folio_batch_count(fbatch); } /** * filemap_get_folios - Get a batch of folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill. * * Search for and return a batch of folios in the mapping starting at * index @start and up to index @end (inclusive). The folios are returned * in @fbatch with an elevated reference count. * * Return: The number of folios which were found. * We also update @start to index the next folio for the traversal. */ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); } EXPORT_SYMBOL(filemap_get_folios); /** * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @fbatch: The batch to fill * * filemap_get_folios_contig() works exactly like filemap_get_folios(), * except the returned folios are guaranteed to be contiguous. This may * not return all contiguous folios if the batch gets filled up. * * Return: The number of folios found. * Also update @start to be positioned for traversal of the next folio. */ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); unsigned long nr; struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio && xas.xa_index <= end; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; /* * If the entry has been swapped out, we can stop looking. * No current caller is looking for DAX entries. */ if (xa_is_value(folio)) goto update_start; /* If we landed in the middle of a THP, continue at its end. */ if (xa_is_sibling(folio)) goto update_start; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } update_start: nr = folio_batch_count(fbatch); if (nr) { folio = fbatch->folios[nr - 1]; *start = folio_next_index(folio); } out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_contig); /** * filemap_get_folios_tag - Get a batch of folios matching @tag * @mapping: The address_space to search * @start: The starting page index * @end: The final page index (inclusive) * @tag: The tag index * @fbatch: The batch to fill * * The first folio may start before @start; if it does, it will contain * @start. The final folio may extend beyond @end; if it does, it will * contain @end. The folios have ascending indices. There may be gaps * between the folios if there are indices which have no folio in the * page cache. If folios are added to or removed from the page cache * while this is running, they may or may not be found by this call. * Only returns folios that are tagged with @tag. * * Return: The number of folios found. * Also update @start to index the next folio for traversal. */ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, tag)) != NULL) { /* * Shadow entries should never be tagged, but this iteration * is lockless so there is a window for page reclaim to evict * a page we saw tagged. Skip over it. */ if (xa_is_value(folio)) continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); *start = folio->index + nr; goto out; } } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This * breaks the iteration when there is a page at index -1 but that is * already broke anyway. */ if (end == (pgoff_t)-1) *start = (pgoff_t)-1; else *start = end + 1; out: rcu_read_unlock(); return folio_batch_count(fbatch); } EXPORT_SYMBOL(filemap_get_folios_tag); /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: * * ---R__________________________________________B__________ * ^ reading here ^ bad block(assume 4k) * * read(R) => miss => readahead(R...B) => media error => frustrating retries * => failing the whole request => read(R) => read(R+1) => * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... * * It is going insane. Fix it by quickly scaling down the readahead size. */ static void shrink_readahead_size_eio(struct file_ra_state *ra) { ra->ra_pages /= 4; } /* * filemap_get_read_batch - Get a batch of folios for read * * Get a batch of folios which represent a contiguous range of bytes in * the file. No exceptional entries will be returned. If @index is in * the middle of a folio, the entire folio will be returned. The last * folio in the batch may have the readahead flag set or the uptodate flag * clear so that the caller can take the appropriate action. */ static void filemap_get_read_batch(struct address_space *mapping, pgoff_t index, pgoff_t max, struct folio_batch *fbatch) { XA_STATE(xas, &mapping->i_pages, index); struct folio *folio; rcu_read_lock(); for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) { if (xas_retry(&xas, folio)) continue; if (xas.xa_index > max || xa_is_value(folio)) break; if (xa_is_sibling(folio)) break; if (!folio_try_get(folio)) goto retry; if (unlikely(folio != xas_reload(&xas))) goto put_folio; if (!folio_batch_add(fbatch, folio)) break; if (!folio_test_uptodate(folio)) break; if (folio_test_readahead(folio)) break; xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); retry: xas_reset(&xas); } rcu_read_unlock(); } static int filemap_read_folio(struct file *file, filler_t filler, struct folio *folio) { bool workingset = folio_test_workingset(folio); unsigned long pflags; int error; /* Start the actual read. The read will unlock the page. */ if (unlikely(workingset)) psi_memstall_enter(&pflags); error = filler(file, folio); if (unlikely(workingset)) psi_memstall_leave(&pflags); if (error) return error; error = folio_wait_locked_killable(folio); if (error) return error; if (folio_test_uptodate(folio)) return 0; if (file) shrink_readahead_size_eio(&file->f_ra); return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, loff_t pos, size_t count, struct folio *folio, bool need_uptodate) { if (folio_test_uptodate(folio)) return true; /* pipes can't handle partially uptodate pages */ if (need_uptodate) return false; if (!mapping->a_ops->is_partially_uptodate) return false; if (mapping->host->i_blkbits >= folio_shift(folio)) return false; if (folio_pos(folio) > pos) { count -= folio_pos(folio) - pos; pos = 0; } else { pos -= folio_pos(folio); } return mapping->a_ops->is_partially_uptodate(folio, pos, count); } static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, size_t count, struct folio *folio, bool need_uptodate) { int error; if (iocb->ki_flags & IOCB_NOWAIT) { if (!filemap_invalidate_trylock_shared(mapping)) return -EAGAIN; } else { filemap_invalidate_lock_shared(mapping); } if (!folio_trylock(folio)) { error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto unlock_mapping; if (!(iocb->ki_flags & IOCB_WAITQ)) { filemap_invalidate_unlock_shared(mapping); /* * This is where we usually end up waiting for a * previously submitted readahead to finish. */ folio_put_wait_locked(folio, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } error = __folio_lock_async(folio, iocb->ki_waitq); if (error) goto unlock_mapping; } error = AOP_TRUNCATED_PAGE; if (!folio->mapping) goto unlock; error = 0; if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio, need_uptodate)) goto unlock; error = -EAGAIN; if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) goto unlock; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); goto unlock_mapping; unlock: folio_unlock(folio); unlock_mapping: filemap_invalidate_unlock_shared(mapping); if (error == AOP_TRUNCATED_PAGE) folio_put(folio); return error; } static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; if (iocb->ki_flags & IOCB_DONTCACHE) __folio_set_dropbehind(folio); /* * Protect against truncate / hole punch. Grabbing invalidate_lock * here assures we cannot instantiate and bring uptodate new * pagecache folios after evicting page cache during truncate * and before actually freeing blocks. Note that we could * release invalidate_lock after inserting the folio into * the page cache as the locked folio would then be enough to * synchronize with hole punching. But there are code paths * such as filemap_update_page() filling in partially uptodate * pages or ->readahead() that need to hold invalidate_lock * while mapping blocks for IO so let's hold the lock here as * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) error = AOP_TRUNCATED_PAGE; if (error) goto error; error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, folio); if (error) goto error; filemap_invalidate_unlock_shared(mapping); folio_batch_add(fbatch, folio); return 0; error: filemap_invalidate_unlock_shared(mapping); folio_put(folio); return error; } static int filemap_readahead(struct kiocb *iocb, struct file *file, struct address_space *mapping, struct folio *folio, pgoff_t last_index) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_DONTCACHE) ractl.dropbehind = 1; page_cache_async_ra(&ractl, folio, last_index - folio->index); return 0; } static int filemap_get_pages(struct kiocb *iocb, size_t count, struct folio_batch *fbatch, bool need_uptodate) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; unsigned int flags; int err = 0; /* "last_index" is the index of the page beyond the end of the read */ last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); retry: if (fatal_signal_pending(current)) return -EINTR; filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); if (iocb->ki_flags & IOCB_DONTCACHE) ractl.dropbehind = 1; page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } folio = fbatch->folios[folio_batch_count(fbatch) - 1]; if (folio_test_readahead(folio)) { err = filemap_readahead(iocb, filp, mapping, folio, last_index); if (err) goto err; } if (!folio_test_uptodate(folio)) { if ((iocb->ki_flags & IOCB_WAITQ) && folio_batch_count(fbatch) > 1) iocb->ki_flags |= IOCB_NOWAIT; err = filemap_update_page(iocb, mapping, count, folio, need_uptodate); if (err) goto err; } trace_mm_filemap_get_pages(mapping, index, last_index - 1); return 0; err: if (err < 0) folio_put(folio); if (likely(--fbatch->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) goto retry; return err; } static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio) { unsigned int shift = folio_shift(folio); return (pos1 >> shift == pos2 >> shift); } static void filemap_end_dropbehind_read(struct address_space *mapping, struct folio *folio) { if (!folio_test_dropbehind(folio)) return; if (folio_test_writeback(folio) || folio_test_dirty(folio)) return; if (folio_trylock(folio)) { if (folio_test_clear_dropbehind(folio)) folio_unmap_invalidate(mapping, folio, 0); folio_unlock(folio); } } /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. * @iter: Destination for the data. * @already_read: Number of bytes already read by the caller. * * Copies data from the page cache. If the data is not currently present, * uses the readahead and read_folio address_space operations to fetch it. * * Return: Total number of bytes copied, including those already read by * the caller. If an error happens before any bytes are copied, returns * a negative error number. */ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; if (unlikely(iocb->ki_pos < 0)) return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { cond_resched(); /* * If we've already successfully copied some data, then we * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; error = filemap_get_pages(iocb, iter->count, &fbatch, false); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(mapping); /* * When a read accesses the same folio several times, only * mark it as accessed the first time. */ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); size_t copied; if (end_offset < folio_pos(folio)) break; if (i > 0) folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); copied = copy_folio_to_iter(folio, offset, bytes, iter); already_read += copied; iocb->ki_pos += copied; last_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; break; } } put_folios: for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; filemap_end_dropbehind_read(mapping, folio); folio_put(folio); } folio_batch_init(&fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); ra->prev_pos = last_pos; return already_read ? already_read : error; } EXPORT_SYMBOL_GPL(filemap_read); int kiocb_write_and_wait(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos; loff_t end = pos + count - 1; if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, pos, end)) return -EAGAIN; return 0; } return filemap_write_and_wait_range(mapping, pos, end); } EXPORT_SYMBOL_GPL(kiocb_write_and_wait); int filemap_invalidate_pages(struct address_space *mapping, loff_t pos, loff_t end, bool nowait) { int ret; if (nowait) { /* we could block if there are any pages in the range */ if (filemap_range_has_page(mapping, pos, end)) return -EAGAIN; } else { ret = filemap_write_and_wait_range(mapping, pos, end); if (ret) return ret; } /* * After a write we want buffered reads to be sure to go to disk to get * the new data. We invalidate clean cached page from the region we're * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); } int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; return filemap_invalidate_pages(mapping, iocb->ki_pos, iocb->ki_pos + count - 1, iocb->ki_flags & IOCB_NOWAIT); } EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); /** * generic_file_read_iter - generic filesystem read routine * @iocb: kernel I/O control block * @iter: destination for the data read * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. * * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall * be returned when no data can be read without waiting for I/O requests * to complete; it doesn't prevent readahead. * * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O * requests shall be made for the read or for readahead. When no data * can be read, -EAGAIN shall be returned. When readahead would be * triggered, a partial, possibly empty read shall be returned. * * Return: * * number of bytes copied, even for partial reads * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { size_t count = iov_iter_count(iter); ssize_t retval = 0; if (!count) return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; retval = kiocb_write_and_wait(iocb, count); if (retval < 0) return retval; file_accessed(file); retval = mapping->a_ops->direct_IO(iocb, iter); if (retval >= 0) { iocb->ki_pos += retval; count -= retval; } if (retval != -EIOCBQUEUED) iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter * compressed extents, so if there was an error, or if * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ if (retval < 0 || !count || IS_DAX(inode)) return retval; if (iocb->ki_pos >= i_size_read(inode)) return retval; } return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); /* * Splice subpages from a folio into a pipe. */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size) { struct page *page; size_t spliced = 0, offset = offset_in_folio(folio, fpos); page = folio_page(folio, offset / PAGE_SIZE); size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); *buf = (struct pipe_buffer) { .ops = &page_cache_pipe_buf_ops, .page = page, .offset = offset, .len = part, }; folio_get(folio); pipe->head++; page++; spliced += part; offset = 0; } return spliced; } /** * filemap_splice_read - Splice data from a file's pagecache into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function gets folios from a file's pagecache and splices them into the * pipe. Readahead will be called as necessary to fill more folios. This may * be used for blockdevs also. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct folio_batch fbatch; struct kiocb iocb; size_t total_spliced = 0, used, npages; loff_t isize, end_offset; bool writably_mapped; int i, error = 0; if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes)) return 0; init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); folio_batch_init(&fbatch); do { cond_resched(); if (*ppos >= i_size_read(in->f_mapping->host)) break; iocb.ki_pos = *ppos; error = filemap_get_pages(&iocb, len, &fbatch, true); if (error < 0) break; /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) break; end_offset = min_t(loff_t, isize, *ppos + len); /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: */ writably_mapped = mapping_writably_mapped(in->f_mapping); for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; size_t n; if (folio_pos(folio) >= end_offset) goto out; folio_mark_accessed(folio); /* * If users can be writing to this folio using arbitrary * virtual addresses, take care of potential aliasing * before reading the folio on the kernel side. */ if (writably_mapped) flush_dcache_folio(folio); n = min_t(loff_t, len, isize - *ppos); n = splice_folio_into_pipe(pipe, folio, *ppos, n); if (!n) goto out; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) goto out; } folio_batch_release(&fbatch); } while (len); out: folio_batch_release(&fbatch); file_accessed(in); return total_spliced ? total_spliced : error; } EXPORT_SYMBOL(filemap_splice_read); static inline loff_t folio_seek_hole_data(struct xa_state *xas, struct address_space *mapping, struct folio *folio, loff_t start, loff_t end, bool seek_data) { const struct address_space_operations *ops = mapping->a_ops; size_t offset, bsz = i_blocksize(mapping->host); if (xa_is_value(folio) || folio_test_uptodate(folio)) return seek_data ? start : end; if (!ops->is_partially_uptodate) return seek_data ? end : start; xas_pause(xas); rcu_read_unlock(); folio_lock(folio); if (unlikely(folio->mapping != mapping)) goto unlock; offset = offset_in_folio(folio, start) & ~(bsz - 1); do { if (ops->is_partially_uptodate(folio, offset, bsz) == seek_data) break; start = (start + bsz) & ~((u64)bsz - 1); offset += bsz; } while (offset < folio_size(folio)); unlock: folio_unlock(folio); rcu_read_lock(); return start; } static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio) { if (xa_is_value(folio)) return PAGE_SIZE << xas_get_order(xas); return folio_size(folio); } /** * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache. * @mapping: Address space to search. * @start: First byte to consider. * @end: Limit of search (exclusive). * @whence: Either SEEK_HOLE or SEEK_DATA. * * If the page cache knows which blocks contain holes and which blocks * contain data, your filesystem can use this function to implement * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are * entirely memory-based such as tmpfs, and filesystems which support * unwritten extents. * * Return: The requested offset on success, or -ENXIO if @whence specifies * SEEK_DATA and there is no data after @start. There is an implicit hole * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start * and @end contain data. */ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start, loff_t end, int whence) { XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT); pgoff_t max = (end - 1) >> PAGE_SHIFT; bool seek_data = (whence == SEEK_DATA); struct folio *folio; if (end <= start) return -ENXIO; rcu_read_lock(); while ((folio = find_get_entry(&xas, max, XA_PRESENT))) { loff_t pos = (u64)xas.xa_index << PAGE_SHIFT; size_t seek_size; if (start < pos) { if (!seek_data) goto unlock; start = pos; } seek_size = seek_folio_size(&xas, folio); pos = round_up((u64)pos + 1, seek_size); start = folio_seek_hole_data(&xas, mapping, folio, start, pos, seek_data); if (start < pos) goto unlock; if (start >= end) break; if (seek_size > PAGE_SIZE) xas_set(&xas, pos >> PAGE_SHIFT); if (!xa_is_value(folio)) folio_put(folio); } if (seek_data) start = -ENXIO; unlock: rcu_read_unlock(); if (folio && !xa_is_value(folio)) folio_put(folio); if (start > end) return end; return start; } #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) /* * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock * @vmf - the vm_fault for this fault. * @folio - the folio to lock. * @fpin - the pointer to the file we may pin (or is already pinned). * * This works similar to lock_folio_or_retry in that it can drop the * mmap_lock. It differs in that it actually returns the folio locked * if it returns 1 and 0 if it couldn't lock the folio. If we did have * to drop the mmap_lock then fpin will point to the pinned file and * needs to be fput()'ed at a later point. */ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, struct file **fpin) { if (folio_trylock(folio)) return 1; /* * NOTE! This will make us return with VM_FAULT_RETRY, but with * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) return 0; *fpin = maybe_unlock_mmap_for_io(vmf, *fpin); if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* * We didn't have the right flags to drop the * fault lock, but all fault_handlers only check * for fatal signals if we return VM_FAULT_RETRY, * so we need to drop the fault lock here and * return 0 if we don't have a fpin. */ if (*fpin == NULL) release_fault_lock(vmf); return 0; } } else __folio_lock(folio); return 1; } /* * Synchronous readahead happens when we don't even find a page in the page * cache at all. We don't want to perform IO under the mmap sem, so if we have * to drop the mmap sem we return the file that was pinned in order for us to do * that. If we didn't pin a file then we return NULL. The file that is * returned needs to be fput()'ed when we're done with it. */ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; unsigned long vm_flags = vmf->vma->vm_flags; unsigned int mmap_miss; #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Use the readahead code, even if readahead is disabled */ if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1); ra->size = HPAGE_PMD_NR; /* * Fetch two PMD folios, so we get the chance to actually * readahead, unless we've been told not to. */ if (!(vm_flags & VM_RAND_READ)) ra->size *= 2; ra->async_size = HPAGE_PMD_NR; page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER); return fpin; } #endif /* If we don't want any read-ahead, don't bother */ if (vm_flags & VM_RAND_READ) return fpin; if (!ra->ra_pages) return fpin; if (vm_flags & VM_SEQ_READ) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_sync_ra(&ractl, ra->ra_pages); return fpin; } /* Avoid banging the cache line if not needed */ mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss < MMAP_LOTSAMISS * 10) WRITE_ONCE(ra->mmap_miss, ++mmap_miss); /* * Do we miss much more than hit in this file? If so, * stop bothering with read-ahead. It will only hurt. */ if (mmap_miss > MMAP_LOTSAMISS) return fpin; /* * mmap read-around */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2); ra->size = ra->ra_pages; ra->async_size = ra->ra_pages / 4; ractl._index = ra->start; page_cache_ra_order(&ractl, ra, 0); return fpin; } /* * Asynchronous readahead happens when we find the page and PG_readahead, * so we want to possibly extend the readahead further. We return the file that * was pinned if we have to drop the mmap_lock in order to do IO. */ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, struct folio *folio) { struct file *file = vmf->vma->vm_file; struct file_ra_state *ra = &file->f_ra; DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff); struct file *fpin = NULL; unsigned int mmap_miss; /* If we don't want any read-ahead, don't bother */ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; mmap_miss = READ_ONCE(ra->mmap_miss); if (mmap_miss) WRITE_ONCE(ra->mmap_miss, --mmap_miss); if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); page_cache_async_ra(&ractl, folio, ra->ra_pages); } return fpin; } static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; pte_t *ptep; /* * We might have COW'ed a pagecache folio and might now have an mlocked * anon folio mapped. The original pagecache folio is not mlocked and * might have been evicted. During a read+clear/modify/write update of * the PTE, such as done in do_numa_page()/change_pte_range(), we * temporarily clear the PTE under PT lock and might detect it here as * "none" when not holding the PT lock. * * Not rechecking the PTE under PT lock could result in an unexpected * major fault in an mlock'ed region. Recheck only for this special * scenario while holding the PT lock, to not degrade non-mlocked * scenarios. Recheck the PTE without PT lock firstly, thereby reducing * the number of times we hold PT lock. */ if (!(vma->vm_flags & VM_LOCKED)) return 0; if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; if (unlikely(!pte_none(ptep_get_lockless(ptep)))) { ret = VM_FAULT_NOPAGE; } else { spin_lock(vmf->ptl); if (unlikely(!pte_none(ptep_get(ptep)))) ret = VM_FAULT_NOPAGE; spin_unlock(vmf->ptl); } pte_unmap(ptep); return ret; } /** * filemap_fault - read in file data for page fault handling * @vmf: struct vm_fault containing details of the fault * * filemap_fault() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. * * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. * * vma->vm_mm->mmap_lock must be held on entry. * * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap(). * * If our return value does not have VM_FAULT_RETRY set, the mmap_lock * has not been released. * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. * * Return: bitwise-OR of %VM_FAULT_ codes. */ vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; struct file *fpin = NULL; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; pgoff_t max_idx, index = vmf->pgoff; struct folio *folio; vm_fault_t ret = 0; bool mapping_locked = false; max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) return VM_FAULT_SIGBUS; trace_mm_filemap_fault(mapping, index); /* * Do we have something in the page cache already? */ folio = filemap_get_folio(mapping, index); if (likely(!IS_ERR(folio))) { /* * We found the page, so try async readahead before waiting for * the lock. */ if (!(vmf->flags & FAULT_FLAG_TRIED)) fpin = do_async_mmap_readahead(vmf, folio); if (unlikely(!folio_test_uptodate(folio))) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } } else { ret = filemap_fault_recheck_pte_none(vmf); if (unlikely(ret)) return ret; /* No page in the page cache at all */ count_vm_event(PGMAJFAULT); count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; fpin = do_sync_mmap_readahead(vmf); retry_find: /* * See comment in filemap_create_folio() why we need * invalidate_lock */ if (!mapping_locked) { filemap_invalidate_lock_shared(mapping); mapping_locked = true; } folio = __filemap_get_folio(mapping, index, FGP_CREAT|FGP_FOR_MMAP, vmf->gfp_mask); if (IS_ERR(folio)) { if (fpin) goto out_retry; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_OOM; } } if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) goto out_retry; /* Did it get truncated? */ if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); folio_put(folio); goto retry_find; } VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* * We have a locked folio in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error, * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { /* * If the invalidate lock is not held, the folio was in cache * and uptodate and now it is not. Strange but possible since we * didn't hold the page lock all the time. Let's drop * everything, get the invalidate lock and try again. */ if (!mapping_locked) { folio_unlock(folio); folio_put(folio); goto retry_find; } /* * OK, the folio is really not uptodate. This can be because the * VMA has the VM_RAND_READ flag set, or because an error * arose. Let's read it in directly. */ goto page_not_uptodate; } /* * We've made it this far and we had to drop our mmap_lock, now is the * time to return to the upper layer and have it re-find the vma and * redo the fault. */ if (fpin) { folio_unlock(folio); goto out_retry; } if (mapping_locked) filemap_invalidate_unlock_shared(mapping); /* * Found the page and have a reference on it. * We must recheck i_size under page lock. */ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(index >= max_idx)) { folio_unlock(folio); folio_put(folio); return VM_FAULT_SIGBUS; } vmf->page = folio_file_page(folio, index); return ret | VM_FAULT_LOCKED; page_not_uptodate: /* * Umm, take care of errors if the page isn't up-to-date. * Try to re-read it _once_. We do this synchronously, * because there really aren't any performance issues here * and we need to check for errors. */ fpin = maybe_unlock_mmap_for_io(vmf, fpin); error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); if (fpin) goto out_retry; folio_put(folio); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; filemap_invalidate_unlock_shared(mapping); return VM_FAULT_SIGBUS; out_retry: /* * We dropped the mmap_lock, we need to return to the fault handler to * re-find the vma and come back and find our hopefully still populated * page. */ if (!IS_ERR(folio)) folio_put(folio); if (mapping_locked) filemap_invalidate_unlock_shared(mapping); if (fpin) fput(fpin); return ret | VM_FAULT_RETRY; } EXPORT_SYMBOL(filemap_fault); static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, pgoff_t start) { struct mm_struct *mm = vmf->vma->vm_mm; /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) { folio_unlock(folio); folio_put(folio); return true; } if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) { struct page *page = folio_file_page(folio, start); vm_fault_t ret = do_set_pmd(vmf, page); if (!ret) { /* The page is mapped successfully, reference consumed. */ folio_unlock(folio); return true; } } if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); return false; } static struct folio *next_uptodate_folio(struct xa_state *xas, struct address_space *mapping, pgoff_t end_pgoff) { struct folio *folio = xas_next_entry(xas, end_pgoff); unsigned long max_idx; do { if (!folio) return NULL; if (xas_retry(xas, folio)) continue; if (xa_is_value(folio)) continue; if (!folio_try_get(folio)) continue; if (folio_test_locked(folio)) goto skip; /* Has the page moved or been split? */ if (unlikely(folio != xas_reload(xas))) goto skip; if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) goto skip; if (!folio_trylock(folio)) goto skip; if (folio->mapping != mapping) goto unlock; if (!folio_test_uptodate(folio)) goto unlock; max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (xas->xa_index >= max_idx) goto unlock; return folio; unlock: folio_unlock(folio); skip: folio_put(folio); } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); return NULL; } /* * Map page range [start_page, start_page + nr_pages) of folio. * start_page is gotten from start by folio_page(folio, start) */ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, unsigned long addr, unsigned int nr_pages, unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; do { if (PageHWPoison(page + count)) goto skip; /* * If there are too many folios that are recently evicted * in a file, they will probably continue to be evicted. * In such situation, read-ahead is only a waste of IO. * Don't decrease mmap_miss in this scenario to make sure * we can stop read-ahead. */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ if (!pte_none(ptep_get(&vmf->pte[count]))) goto skip; count++; continue; skip: if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } count++; page += count; vmf->pte += count; addr += count * PAGE_SIZE; count = 0; } while (--nr_pages > 0); if (count) { set_pte_range(vmf, folio, page, count, addr); *rss += count; folio_ref_add(folio, count); if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } vmf->pte = old_ptep; return ret; } static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, struct folio *folio, unsigned long addr, unsigned long *rss, unsigned int *mmap_miss) { vm_fault_t ret = 0; struct page *page = &folio->page; if (PageHWPoison(page)) return ret; /* See comment of filemap_map_folio_range() */ if (!folio_test_workingset(folio)) (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be * handled in the specific fault path, and it'll prohibit * the fault-around logic. */ if (!pte_none(ptep_get(vmf->pte))) return ret; if (vmf->address == addr) ret = VM_FAULT_NOPAGE; set_pte_range(vmf, folio, page, 1, addr); (*rss)++; folio_ref_inc(folio); return ret; } vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct vm_area_struct *vma = vmf->vma; struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t file_end, last_pgoff = start_pgoff; unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; unsigned long rss = 0; unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; if (filemap_map_pmd(vmf, folio, start_pgoff)) { ret = VM_FAULT_NOPAGE; goto out; } addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); if (!vmf->pte) { folio_unlock(folio); folio_put(folio); goto out; } file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; if (end_pgoff > file_end) end_pgoff = file_end; folio_type = mm_counter_file(folio); do { unsigned long end; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; if (!folio_test_large(folio)) ret |= filemap_map_order0_folio(vmf, folio, addr, &rss, &mmap_miss); else ret |= filemap_map_folio_range(vmf, folio, xas.xa_index - folio->index, addr, nr_pages, &rss, &mmap_miss); folio_unlock(folio); folio_put(folio); } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); add_mm_counter(vma->vm_mm, folio_type, rss); pte_unmap_unlock(vmf->pte, vmf->ptl); trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff); out: rcu_read_unlock(); mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); if (mmap_miss >= mmap_miss_saved) WRITE_ONCE(file->f_ra.mmap_miss, 0); else WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); return ret; } EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct folio *folio = page_folio(vmf->page); vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); folio_lock(folio); if (folio->mapping != mapping) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } /* * We mark the folio dirty already here so that when freeze is in * progress, we are guaranteed that writeback during freezing will * see the dirty folio and writeprotect it again. */ folio_mark_dirty(folio); folio_wait_stable(folio); out: sb_end_pagefault(mapping->host->i_sb); return ret; } const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, }; /* This is used for a general mmap of a disk file */ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; return 0; } /* * This is for filesystems which do not implement ->writepage. */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } #else vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } #endif /* CONFIG_MMU */ EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); static struct folio *do_read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) { struct folio *folio; int err; if (!filler) filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) { folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping)); if (!folio) return ERR_PTR(-ENOMEM); index = mapping_align_index(mapping, index); err = filemap_add_folio(mapping, folio, index, gfp); if (unlikely(err)) { folio_put(folio); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } goto filler; } if (folio_test_uptodate(folio)) goto out; if (!folio_trylock(folio)) { folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); goto repeat; } /* Folio was truncated from mapping */ if (!folio->mapping) { folio_unlock(folio); folio_put(folio); goto repeat; } /* Someone else locked and filled the page in a very small window */ if (folio_test_uptodate(folio)) { folio_unlock(folio); goto out; } filler: err = filemap_read_folio(file, filler, folio); if (err) { folio_put(folio); if (err == AOP_TRUNCATED_PAGE) goto repeat; return ERR_PTR(err); } out: folio_mark_accessed(folio); return folio; } /** * read_cache_folio - Read into page cache, fill it if needed. * @mapping: The address_space to read from. * @index: The index to read. * @filler: Function to perform the read, or NULL to use aops->read_folio(). * @file: Passed to filler function, may be NULL if not required. * * Read one page into the page cache. If it succeeds, the folio returned * will contain @index, but it may not be the first page of the folio. * * If the filler function returns an error, it will be returned to the * caller. * * Context: May sleep. Expects mapping->invalidate_lock to be held. * Return: An uptodate folio on success, ERR_PTR() on failure. */ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, filler_t filler, struct file *file) { return do_read_cache_folio(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_folio); /** * mapping_read_folio_gfp - Read into page cache, using specified allocation flags. * @mapping: The address_space for the folio. * @index: The index that the allocated folio will contain. * @gfp: The page allocator flags to use if allocating. * * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with * any new memory allocations done using the specified allocation flags. * * The most likely error from this function is EIO, but ENOMEM is * possible and so is EINTR. If ->read_folio returns another error, * that will be returned to the caller. * * The function expects mapping->invalidate_lock to be already held. * * Return: Uptodate folio on success, ERR_PTR() on failure. */ struct folio *mapping_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_folio(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(mapping_read_folio_gfp); static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { struct folio *folio; folio = do_read_cache_folio(mapping, index, filler, file, gfp); if (IS_ERR(folio)) return &folio->page; return folio_file_page(folio, index); } struct page *read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, struct file *file) { return do_read_cache_page(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. * @mapping: the page's address_space * @index: the page index * @gfp: the page allocator flags to use if allocating * * This is the same as "read_mapping_page(mapping, index, NULL)", but with * any new page allocations done using the specified allocation flags. * * If the page does not get brought uptodate, return -EIO. * * The function expects mapping->invalidate_lock to be already held. * * Return: up to date page on success, ERR_PTR() on failure. */ struct page *read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { return do_read_cache_page(mapping, index, NULL, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); /* * Warn about a page cache invalidation failure during a direct I/O write. */ static void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; char *path; errseq_set(&filp->f_mapping->wb_err, -EIO); if (__ratelimit(&_rs)) { path = file_path(filp, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, current->comm); } } void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count) { struct address_space *mapping = iocb->ki_filp->f_mapping; if (mapping->nrpages && invalidate_inode_pages2_range(mapping, iocb->ki_pos >> PAGE_SHIFT, (iocb->ki_pos + count - 1) >> PAGE_SHIFT)) dio_warn_stale_pagecache(iocb->ki_filp); } ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct address_space *mapping = iocb->ki_filp->f_mapping; size_t write_len = iov_iter_count(from); ssize_t written; /* * If a page can not be invalidated, return 0 to fall back * to buffered write. */ written = kiocb_invalidate_pages(iocb, write_len); if (written) { if (written == -EBUSY) return 0; return written; } written = mapping->a_ops->direct_IO(iocb, from); /* * Finally, try again to invalidate clean pages which might have been * cached by non-direct readahead, or faulted in by get_user_pages() * if the source of the write was an mmap'ed region of the file * we're writing. Either one is a pretty crazy thing to do, * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... * * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break * them by removing it completely. * * Noticeable example is a blkdev_direct_IO(). * * Skip invalidation for async writes or if mapping has no pages. */ if (written > 0) { struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; kiocb_invalidate_post_direct_write(iocb, written); pos += written; write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } iocb->ki_pos = pos; } if (written != -EIOCBQUEUED) iov_iter_revert(from, write_len - iov_iter_count(from)); return written; } EXPORT_SYMBOL(generic_file_direct_write); ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) { struct file *file = iocb->ki_filp; loff_t pos = iocb->ki_pos; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; size_t chunk = mapping_max_folio_size(mapping); long status = 0; ssize_t written = 0; do { struct folio *folio; size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ void *fsdata = NULL; bytes = iov_iter_count(i); retry: offset = pos & (chunk - 1); bytes = min(chunk - offset, bytes); balance_dirty_pages_ratelimited(mapping); if (fatal_signal_pending(current)) { status = -EINTR; break; } status = a_ops->write_begin(file, mapping, pos, bytes, &folio, &fsdata); if (unlikely(status < 0)) break; offset = offset_in_folio(folio, pos); if (bytes > folio_size(folio) - offset) bytes = folio_size(folio) - offset; if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); /* * Faults here on mmap()s can recurse into arbitrary * filesystem code. Lots of locks are held that can * deadlock. Use an atomic copy to avoid deadlocking * in page fault handling. */ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); flush_dcache_folio(folio); status = a_ops->write_end(file, mapping, pos, bytes, copied, folio, fsdata); if (unlikely(status != copied)) { iov_iter_revert(i, copied - max(status, 0L)); if (unlikely(status < 0)) break; } cond_resched(); if (unlikely(status == 0)) { /* * A short copy made ->write_end() reject the * thing entirely. Might be memory poisoning * halfway through, might be a race with munmap, * might be severe memory pressure. */ if (chunk > PAGE_SIZE) chunk /= 2; if (copied) { bytes = copied; goto retry; } /* * 'folio' is now unlocked and faults on it can be * handled. Ensure forward progress by trying to * fault it in now. */ if (fault_in_iov_iter_readable(i, bytes) == bytes) { status = -EFAULT; break; } } else { pos += status; written += status; } } while (iov_iter_count(i)); if (!written) return status; iocb->ki_pos += written; return written; } EXPORT_SYMBOL(generic_perform_write); /** * __generic_file_write_iter - write data to a file * @iocb: IO state structure (file, offset, etc.) * @from: iov_iter with data to write * * This function does all the work needed for actually writing data to a * file. It does all basic checks, removes SUID from the file, updates * modification times and calls proper subroutines depending on whether we * do direct IO or a standard buffered write. * * It expects i_rwsem to be grabbed unless we work on a block device or similar * object which does not need locking at all. * * This function does *not* take care of syncing data in case of O_SYNC write. * A caller has to handle it. This is mainly due to the fact that we want to * avoid syncing under i_rwsem. * * Return: * * number of bytes written, even for truncated writes * * negative error code if no data has been written at all */ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; ret = file_remove_privs(file); if (ret) return ret; ret = file_update_time(file); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT) { ret = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to * holes, for example. For DAX files, a buffered write will * not succeed (even if it did, DAX does not handle dirty * page-cache pages correctly). */ if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode)) return ret; return direct_write_fallback(iocb, from, ret, generic_perform_write(iocb, from)); } return generic_perform_write(iocb, from); } EXPORT_SYMBOL(__generic_file_write_iter); /** * generic_file_write_iter - write data to a file * @iocb: IO state structure * @from: iov_iter with data to write * * This is a wrapper around __generic_file_write_iter() to be used by most * filesystems. It takes care of syncing the file in case of O_SYNC file * and acquires i_rwsem as needed. * Return: * * negative error code if no data has been written at all of * vfs_fsync_range() failed for a synchronous write * * number of bytes written, even for truncated writes */ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } EXPORT_SYMBOL(generic_file_write_iter); /** * filemap_release_folio() - Release fs-specific metadata on a folio. * @folio: The folio which the kernel is trying to free. * @gfp: Memory allocation flags (and I/O mode). * * The address_space is trying to release any data attached to a folio * (presumably at folio->private). * * This will also be called if the private_2 flag is set on a page, * indicating that the folio has other metadata associated with it. * * The @gfp argument specifies whether I/O may be performed to release * this page (__GFP_IO), and whether the call may block * (__GFP_RECLAIM & __GFP_FS). * * Return: %true if the release was successful, otherwise %false. */ bool filemap_release_folio(struct folio *folio, gfp_t gfp) { struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); if (!folio_needs_release(folio)) return true; if (folio_test_writeback(folio)) return false; if (mapping && mapping->a_ops->release_folio) return mapping->a_ops->release_folio(folio, gfp); return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); /** * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache * @inode: The inode to flush * @flush: Set to write back rather than simply invalidate. * @start: First byte to in range. * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start * onwards. * * Invalidate all the folios on an inode that contribute to the specified * range, possibly writing them back first. Whilst the operation is * undertaken, the invalidate lock is held to prevent new folios from being * installed. */ int filemap_invalidate_inode(struct inode *inode, bool flush, loff_t start, loff_t end) { struct address_space *mapping = inode->i_mapping; pgoff_t first = start >> PAGE_SHIFT; pgoff_t last = end >> PAGE_SHIFT; pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1; if (!mapping || !mapping->nrpages || end < start) goto out; /* Prevent new folios from being added to the inode. */ filemap_invalidate_lock(mapping); if (!mapping->nrpages) goto unlock; unmap_mapping_pages(mapping, first, nr, false); /* Write back the data if we're asked to. */ if (flush) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = start, .range_end = end, }; filemap_fdatawrite_wbc(mapping, &wbc); } /* Wait for writeback to complete on all folios and discard. */ invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE); unlock: filemap_invalidate_unlock(mapping); out: return filemap_check_errors(mapping); } EXPORT_SYMBOL_GPL(filemap_invalidate_inode); #ifdef CONFIG_CACHESTAT_SYSCALL /** * filemap_cachestat() - compute the page cache statistics of a mapping * @mapping: The mapping to compute the statistics for. * @first_index: The starting page cache index. * @last_index: The final page index (inclusive). * @cs: the cachestat struct to write the result to. * * This will query the page cache statistics of a mapping in the * page range of [first_index, last_index] (inclusive). The statistics * queried include: number of dirty pages, number of pages marked for * writeback, and the number of (recently) evicted pages. */ static void filemap_cachestat(struct address_space *mapping, pgoff_t first_index, pgoff_t last_index, struct cachestat *cs) { XA_STATE(xas, &mapping->i_pages, first_index); struct folio *folio; /* Flush stats (and potentially sleep) outside the RCU read section. */ mem_cgroup_flush_stats_ratelimited(NULL); rcu_read_lock(); xas_for_each(&xas, folio, last_index) { int order; unsigned long nr_pages; pgoff_t folio_first_index, folio_last_index; /* * Don't deref the folio. It is not pinned, and might * get freed (and reused) underneath us. * * We *could* pin it, but that would be expensive for * what should be a fast and lightweight syscall. * * Instead, derive all information of interest from * the rcu-protected xarray. */ if (xas_retry(&xas, folio)) continue; order = xas_get_order(&xas); nr_pages = 1 << order; folio_first_index = round_down(xas.xa_index, 1 << order); folio_last_index = folio_first_index + nr_pages - 1; /* Folios might straddle the range boundaries, only count covered pages */ if (folio_first_index < first_index) nr_pages -= first_index - folio_first_index; if (folio_last_index > last_index) nr_pages -= folio_last_index - last_index; if (xa_is_value(folio)) { /* page is evicted */ void *shadow = (void *)folio; bool workingset; /* not used */ cs->nr_evicted += nr_pages; #ifdef CONFIG_SWAP /* implies CONFIG_MMU */ if (shmem_mapping(mapping)) { /* shmem file - in swap cache */ swp_entry_t swp = radix_to_swp_entry(folio); /* swapin error results in poisoned entry */ if (non_swap_entry(swp)) goto resched; /* * Getting a swap entry from the shmem * inode means we beat * shmem_unuse(). rcu_read_lock() * ensures swapoff waits for us before * freeing the swapper space. However, * we can race with swapping and * invalidation, so there might not be * a shadow in the swapcache (yet). */ shadow = get_shadow_from_swap_cache(swp); if (!shadow) goto resched; } #endif if (workingset_test_recent(shadow, true, &workingset, false)) cs->nr_recently_evicted += nr_pages; goto resched; } /* page is in cache */ cs->nr_cache += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY)) cs->nr_dirty += nr_pages; if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK)) cs->nr_writeback += nr_pages; resched: if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); } /* * See mincore: reveal pagecache information only for files * that the calling process has write access to, or could (if * tried) open for writing. */ static inline bool can_do_cachestat(struct file *f) { if (f->f_mode & FMODE_WRITE) return true; if (inode_owner_or_capable(file_mnt_idmap(f), file_inode(f))) return true; return file_permission(f, MAY_WRITE) == 0; } /* * The cachestat(2) system call. * * cachestat() returns the page cache statistics of a file in the * bytes range specified by `off` and `len`: number of cached pages, * number of dirty pages, number of pages marked for writeback, * number of evicted pages, and number of recently evicted pages. * * An evicted page is a page that is previously in the page cache * but has been evicted since. A page is recently evicted if its last * eviction was recent enough that its reentry to the cache would * indicate that it is actively being used by the system, and that * there is memory pressure on the system. * * `off` and `len` must be non-negative integers. If `len` > 0, * the queried range is [`off`, `off` + `len`]. If `len` == 0, * we will query in the range from `off` to the end of the file. * * The `flags` argument is unused for now, but is included for future * extensibility. User should pass 0 (i.e no flag specified). * * Currently, hugetlbfs is not supported. * * Because the status of a page can change after cachestat() checks it * but before it returns to the application, the returned values may * contain stale information. * * return values: * zero - success * -EFAULT - cstat or cstat_range points to an illegal address * -EINVAL - invalid flags * -EBADF - invalid file descriptor * -EOPNOTSUPP - file descriptor is of a hugetlbfs file */ SYSCALL_DEFINE4(cachestat, unsigned int, fd, struct cachestat_range __user *, cstat_range, struct cachestat __user *, cstat, unsigned int, flags) { CLASS(fd, f)(fd); struct address_space *mapping; struct cachestat_range csr; struct cachestat cs; pgoff_t first_index, last_index; if (fd_empty(f)) return -EBADF; if (copy_from_user(&csr, cstat_range, sizeof(struct cachestat_range))) return -EFAULT; /* hugetlbfs is not supported */ if (is_file_hugepages(fd_file(f))) return -EOPNOTSUPP; if (!can_do_cachestat(fd_file(f))) return -EPERM; if (flags != 0) return -EINVAL; first_index = csr.off >> PAGE_SHIFT; last_index = csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT; memset(&cs, 0, sizeof(struct cachestat)); mapping = fd_file(f)->f_mapping; filemap_cachestat(mapping, first_index, last_index, &cs); if (copy_to_user(cstat, &cs, sizeof(struct cachestat))) return -EFAULT; return 0; } #endif /* CONFIG_CACHESTAT_SYSCALL */
3 10 9 1 8 5 4 1 1 2 2 19 1 6 1 1 1 70 1 1 1 3 3 3 4 1 3 3 3 4 4 4 4 4 8 1 2 5 1 4 3 3 3 2 2 1 1 6 3 3 3 10 274 273 275 1 1 508 40 302 89 25 14 1 2 2 2 1 2 1 18 23 1 285 8 4 9 1 1192 1197 1194 1159 206 206 868 862 3 837 26 308 2 1 81 82 82 77 5 256 1 29 3 23 178 184 293 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 // SPDX-License-Identifier: GPL-2.0 #include <linux/kmod.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/phylib_stubs.h> #include <linux/ptp_clock_kernel.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> #include <net/netdev_lock.h> #include <net/wext.h> #include "dev.h" /* * Map an interface index to its name (SIOCGIFNAME) */ /* * We need this ioctl for efficient implementation of the * if_indextoname() function required by the IPv6 API. Without * it, we would have to search all the interfaces to find a * match. --pb */ static int dev_ifname(struct net *net, struct ifreq *ifr) { ifr->ifr_name[IFNAMSIZ-1] = 0; return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } /* * Perform a SIOCGIFCONF call. This structure will change * size eventually, and there is nothing I can do about it. * Thus we will need a 'compatibility mode'. */ int dev_ifconf(struct net *net, struct ifconf __user *uifc) { struct net_device *dev; void __user *pos; size_t size; int len, total = 0, done; /* both the ifconf and the ifreq structures are slightly different */ if (in_compat_syscall()) { struct compat_ifconf ifc32; if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) return -EFAULT; pos = compat_ptr(ifc32.ifcbuf); len = ifc32.ifc_len; size = sizeof(struct compat_ifreq); } else { struct ifconf ifc; if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) return -EFAULT; pos = ifc.ifc_buf; len = ifc.ifc_len; size = sizeof(struct ifreq); } /* Loop over the interfaces, and write an info block for each. */ rtnl_net_lock(net); for_each_netdev(net, dev) { if (!pos) done = inet_gifconf(dev, NULL, 0, size); else done = inet_gifconf(dev, pos + total, len - total, size); if (done < 0) { rtnl_net_unlock(net); return -EFAULT; } total += done; } rtnl_net_unlock(net); return put_user(total, &uifc->ifc_len); } static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) { struct ifmap *ifmap = &ifr->ifr_map; if (in_compat_syscall()) { struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; cifmap->mem_start = dev->mem_start; cifmap->mem_end = dev->mem_end; cifmap->base_addr = dev->base_addr; cifmap->irq = dev->irq; cifmap->dma = dev->dma; cifmap->port = dev->if_port; return 0; } ifmap->mem_start = dev->mem_start; ifmap->mem_end = dev->mem_end; ifmap->base_addr = dev->base_addr; ifmap->irq = dev->irq; ifmap->dma = dev->dma; ifmap->port = dev->if_port; return 0; } static int netif_setifmap(struct net_device *dev, struct ifreq *ifr) { struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; if (!dev->netdev_ops->ndo_set_config) return -EOPNOTSUPP; if (in_compat_syscall()) { struct ifmap ifmap = { .mem_start = cifmap->mem_start, .mem_end = cifmap->mem_end, .base_addr = cifmap->base_addr, .irq = cifmap->irq, .dma = cifmap->dma, .port = cifmap->port, }; return dev->netdev_ops->ndo_set_config(dev, &ifmap); } return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); } /* * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name); if (!dev) return -ENODEV; switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ ifr->ifr_flags = (short) dev_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface (currently unused) */ ifr->ifr_metric = 0; return 0; case SIOCGIFMTU: /* Get the MTU of a device */ ifr->ifr_mtu = dev->mtu; return 0; case SIOCGIFSLAVE: err = -EINVAL; break; case SIOCGIFMAP: return dev_getifmap(dev, ifr); case SIOCGIFINDEX: ifr->ifr_ifindex = dev->ifindex; return 0; case SIOCGIFTXQLEN: ifr->ifr_qlen = dev->tx_queue_len; return 0; default: /* dev_ioctl() should ensure this case * is never reached */ WARN_ON(1); err = -ENOTTY; break; } return err; } int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; int tx_type_valid = 0; int rx_filter_valid = 0; if (cfg->flags & ~HWTSTAMP_FLAG_MASK) return -EINVAL; tx_type = cfg->tx_type; rx_filter = cfg->rx_filter; switch (tx_type) { case HWTSTAMP_TX_OFF: case HWTSTAMP_TX_ON: case HWTSTAMP_TX_ONESTEP_SYNC: case HWTSTAMP_TX_ONESTEP_P2P: tx_type_valid = 1; break; case __HWTSTAMP_TX_CNT: /* not a real value */ break; } switch (rx_filter) { case HWTSTAMP_FILTER_NONE: case HWTSTAMP_FILTER_ALL: case HWTSTAMP_FILTER_SOME: case HWTSTAMP_FILTER_PTP_V1_L4_EVENT: case HWTSTAMP_FILTER_PTP_V1_L4_SYNC: case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; case __HWTSTAMP_FILTER_CNT: /* not a real value */ break; } if (!tx_type_valid || !rx_filter_valid) return -ERANGE; return 0; } /** * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC * or of attached phylib PHY * @dev: Network device * @cfg: Timestamping configuration structure * * Helper for calling the default hardware provider timestamping. * * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and * there only exists a phydev->mii_ts->hwtstamp() method. So this will return * -EOPNOTSUPP for phylib for now, which is still more accurate than letting * the netdev handle the GET request. */ int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct hwtstamp_provider *hwprov; hwprov = rtnl_dereference(dev->hwprov); if (hwprov) { cfg->qualifier = hwprov->desc.qualifier; if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && hwprov->phydev) return phy_hwtstamp_get(hwprov->phydev, cfg); if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); return -EOPNOTSUPP; } if (phy_is_default_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); } static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config kernel_cfg = {}; struct hwtstamp_config cfg; int err; if (!ops->ndo_hwtstamp_get) return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ if (!netif_device_present(dev)) return -ENODEV; kernel_cfg.ifr = ifr; netdev_lock_ops(dev); err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); netdev_unlock_ops(dev); if (err) return err; /* If the request was resolved through an unconverted driver, omit * the copy_to_user(), since the implementation has already done that */ if (!kernel_cfg.copied_to_user) { hwtstamp_config_from_kernel(&cfg, &kernel_cfg); if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } return 0; } /** * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC * or of attached phylib PHY * @dev: Network device * @cfg: Timestamping configuration structure * @extack: Netlink extended ack message structure, for error reporting * * Helper for enforcing a common policy that phylib timestamping, if available, * should take precedence in front of hardware timestamping provided by the * netdev. If the netdev driver needs to perform specific actions even for PHY * timestamping to work properly (a switch port must trap the timestamped * frames and not forward them), it must set dev->see_all_hwtstamp_requests. */ int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config old_cfg = {}; struct hwtstamp_provider *hwprov; struct phy_device *phydev; bool changed = false; bool phy_ts; int err; hwprov = rtnl_dereference(dev->hwprov); if (hwprov) { if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && hwprov->phydev) { phy_ts = true; phydev = hwprov->phydev; } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) { phy_ts = false; } else { return -EOPNOTSUPP; } cfg->qualifier = hwprov->desc.qualifier; } else { phy_ts = phy_is_default_hwtstamp(dev->phydev); if (phy_ts) phydev = dev->phydev; } cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; if (phy_ts && dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_get(dev, &old_cfg); if (err) return err; } if (!phy_ts || dev->see_all_hwtstamp_requests) { err = ops->ndo_hwtstamp_set(dev, cfg, extack); if (err) { if (extack->_msg) netdev_err(dev, "%s\n", extack->_msg); return err; } } if (phy_ts && dev->see_all_hwtstamp_requests) changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { err = phy_hwtstamp_set(phydev, cfg, extack); if (err) { if (changed) ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); return err; } } return 0; } static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) { const struct net_device_ops *ops = dev->netdev_ops; struct kernel_hwtstamp_config kernel_cfg = {}; struct netlink_ext_ack extack = {}; struct hwtstamp_config cfg; int err; if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) return -EFAULT; hwtstamp_config_to_kernel(&kernel_cfg, &cfg); kernel_cfg.ifr = ifr; err = net_hwtstamp_validate(&kernel_cfg); if (err) return err; err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack); if (err) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return err; } if (!ops->ndo_hwtstamp_set) return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ if (!netif_device_present(dev)) return -ENODEV; netdev_lock_ops(dev); err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); netdev_unlock_ops(dev); if (err) return err; /* The driver may have modified the configuration, so copy the * updated version of it back to user space */ if (!kernel_cfg.copied_to_user) { hwtstamp_config_from_kernel(&cfg, &kernel_cfg); if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } return 0; } static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, struct kernel_hwtstamp_config *kernel_cfg) { struct ifreq ifrr; int err; strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; err = dev_eth_ioctl(dev, &ifrr, cmd); if (err) return err; kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; kernel_cfg->copied_to_user = true; return 0; } int generic_hwtstamp_get_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg) { const struct net_device_ops *ops = dev->netdev_ops; if (!netif_device_present(dev)) return -ENODEV; if (ops->ndo_hwtstamp_get) return dev_get_hwtstamp_phylib(dev, kernel_cfg); /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); } EXPORT_SYMBOL(generic_hwtstamp_get_lower); int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; if (!netif_device_present(dev)) return -ENODEV; if (ops->ndo_hwtstamp_set) return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); } EXPORT_SYMBOL(generic_hwtstamp_set_lower); static int dev_siocbond(struct net_device *dev, struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocbond) { int ret = -ENODEV; netdev_lock_ops(dev); if (netif_device_present(dev)) ret = ops->ndo_siocbond(dev, ifr, cmd); netdev_unlock_ops(dev); return ret; } return -EOPNOTSUPP; } static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocdevprivate) { int ret = -ENODEV; netdev_lock_ops(dev); if (netif_device_present(dev)) ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd); netdev_unlock_ops(dev); return ret; } return -EOPNOTSUPP; } static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocwandev) { int ret = -ENODEV; netdev_lock_ops(dev); if (netif_device_present(dev)) ret = ops->ndo_siocwandev(dev, ifs); netdev_unlock_ops(dev); return ret; } return -EOPNOTSUPP; } /* * Perform the SIOCxIFxxx calls, inside rtnl_net_lock() */ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; if (!dev) return -ENODEV; ops = dev->netdev_ops; switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ return dev_change_flags(dev, ifr->ifr_flags, NULL); case SIOCSIFMETRIC: /* Set the metric on the interface (currently unused) */ return -EOPNOTSUPP; case SIOCSIFMTU: /* Set the MTU of a device */ return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: if (dev->addr_len > sizeof(struct sockaddr)) return -EINVAL; return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, min(sizeof(ifr->ifr_hwaddr.sa_data_min), (size_t)dev->addr_len)); netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); netdev_unlock_ops(dev); return 0; case SIOCSIFMAP: netdev_lock_ops(dev); err = netif_setifmap(dev, ifr); netdev_unlock_ops(dev); return err; case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; netdev_lock_ops(dev); err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); netdev_unlock_ops(dev); return err; case SIOCDELMULTI: if (!ops->ndo_set_rx_mode || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; netdev_lock_ops(dev); err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); netdev_unlock_ops(dev); return err; case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) return -EINVAL; return dev_change_tx_queue_len(dev, ifr->ifr_qlen); case SIOCSIFNAME: ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); case SIOCWANDEV: return dev_siocwandev(dev, &ifr->ifr_settings); case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15: return dev_siocdevprivate(dev, ifr, data, cmd); case SIOCSHWTSTAMP: return dev_set_hwtstamp(dev, ifr); case SIOCGHWTSTAMP: return dev_get_hwtstamp(dev, ifr); case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: return dev_eth_ioctl(dev, ifr, cmd); case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCBONDCHANGEACTIVE: return dev_siocbond(dev, ifr, cmd); /* Unknown ioctl */ default: err = -EINVAL; } return err; } /** * dev_load - load a network module * @net: the applicable net namespace * @name: name of interface * * If a network interface is not present and the process has suitable * privileges this function loads the module. If module loading is not * available in this kernel then it becomes a nop. */ void dev_load(struct net *net, const char *name) { struct net_device *dev; int no_module; rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); rcu_read_unlock(); no_module = !dev; if (no_module && capable(CAP_NET_ADMIN)) no_module = request_module("netdev-%s", name); if (no_module && capable(CAP_SYS_MODULE)) request_module("%s", name); } EXPORT_SYMBOL(dev_load); /* * This function handles all "interface"-type I/O control requests. The actual * 'doing' part of this is dev_ifsioc above. */ /** * dev_ioctl - network device ioctl * @net: the applicable net namespace * @cmd: command to issue * @ifr: pointer to a struct ifreq in user space * @data: data exchanged with userspace * @need_copyout: whether or not copy_to_user() should be called * * Issue ioctl functions to devices. This is normally called by the * user space syscall interfaces but can sometimes be useful for * other purposes. The return value is the return from the syscall if * positive or a negative errno code on error. */ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, void __user *data, bool *need_copyout) { int ret; char *colon; if (need_copyout) *need_copyout = true; if (cmd == SIOCGIFNAME) return dev_ifname(net, ifr); ifr->ifr_name[IFNAMSIZ-1] = 0; colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; /* * See which interface the caller is talking about. */ switch (cmd) { case SIOCGIFHWADDR: dev_load(net, ifr->ifr_name); ret = dev_get_mac_address(&ifr->ifr_hwaddr, net, ifr->ifr_name); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - can be done by all. * - atomic and do not require locking. * - return a value */ case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: case SIOCGIFSLAVE: case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: dev_load(net, ifr->ifr_name); rcu_read_lock(); ret = dev_ifsioc_locked(net, ifr, cmd); rcu_read_unlock(); if (colon) *colon = ':'; return ret; case SIOCETHTOOL: dev_load(net, ifr->ifr_name); ret = dev_ethtool(net, ifr, data); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - require superuser power. * - require strict serialization. * - return a value */ case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSIFNAME: dev_load(net, ifr->ifr_name); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_net_unlock(net); if (colon) *colon = ':'; return ret; /* * These ioctl calls: * - require superuser power. * - require strict serialization. * - do not return a value */ case SIOCSIFMAP: case SIOCSIFTXQLEN: if (!capable(CAP_NET_ADMIN)) return -EPERM; fallthrough; /* * These ioctl calls: * - require local superuser power. * - require strict serialization. * - do not return a value */ case SIOCSIFFLAGS: case SIOCSIFMETRIC: case SIOCSIFMTU: case SIOCSIFHWADDR: case SIOCSIFSLAVE: case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFHWBROADCAST: case SIOCSMIIREG: case SIOCBONDENSLAVE: case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_net_unlock(net); if (need_copyout) *need_copyout = false; return ret; case SIOCGIFMEM: /* Get the per device memory space. We can add this but * currently do not support it */ case SIOCSIFMEM: /* Set the per device memory buffer space. * Not applicable in our case */ case SIOCSIFLINK: return -ENOTTY; /* * Unknown or private ioctl. */ default: if (cmd == SIOCWANDEV || cmd == SIOCGHWTSTAMP || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); rtnl_net_unlock(net); return ret; } return -ENOTTY; } }
96 94 94 94 94 94 94 94 94 94 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 // SPDX-License-Identifier: GPL-2.0-or-later /* Maintain an RxRPC server socket to do AFS communications through * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/slab.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "internal.h" #include "afs_cm.h" #include "protocol_yfs.h" #define RXRPC_TRACE_ONLY_DEFINE_ENUMS #include <trace/events/rxrpc.h> struct workqueue_struct *afs_async_calls; static void afs_deferred_free_worker(struct work_struct *work); static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long); static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); static int afs_deliver_cm_op_id(struct afs_call *); static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { .notify_new_call = afs_rx_new_call, .discard_new_call = afs_rx_discard_new_call, .user_attach_call = afs_rx_attach, .notify_oob = afs_rx_notify_oob, }; /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", .deliver = afs_deliver_cm_op_id, }; /* * open an RxRPC socket and bind it to be a server for callback notifications * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT */ int afs_open_socket(struct afs_net *net) { struct sockaddr_rxrpc srx; struct socket *socket; int ret; _enter(""); ret = sock_create_kern(net->net, AF_RXRPC, SOCK_DGRAM, PF_INET6, &socket); if (ret < 0) goto error_1; socket->sk->sk_allocation = GFP_NOFS; socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); srx.srx_family = AF_RXRPC; srx.srx_service = CM_SERVICE; srx.transport_type = SOCK_DGRAM; srx.transport_len = sizeof(srx.transport.sin6); srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(AFS_CM_PORT); ret = rxrpc_sock_set_min_security_level(socket->sk, RXRPC_SECURITY_ENCRYPT); if (ret < 0) goto error_2; ret = rxrpc_sock_set_manage_response(socket->sk, true); if (ret < 0) goto error_2; ret = afs_create_token_key(net, socket); if (ret < 0) pr_err("Couldn't create RxGK CM key: %d\n", ret); ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); } if (ret < 0) goto error_2; srx.srx_service = YFS_CM_SERVICE; ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret < 0) goto error_2; /* Ideally, we'd turn on service upgrade here, but we can't because * OpenAFS is buggy and leaks the userStatus field from packet to * packet and between FS packets and CB packets - so if we try to do an * upgrade on an FS packet, OpenAFS will leak that into the CB packet * it sends back to us. */ rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) goto error_2; net->socket = socket; afs_charge_preallocation(&net->charge_preallocation_work); _leave(" = 0"); return 0; error_2: sock_release(socket); error_1: _leave(" = %d", ret); return ret; } /* * close the RxRPC socket AFS was using */ void afs_close_socket(struct afs_net *net) { _enter(""); kernel_listen(net->socket, 0); flush_workqueue(afs_async_calls); if (net->spare_incoming_call) { afs_put_call(net->spare_incoming_call); net->spare_incoming_call = NULL; } _debug("outstanding %u", atomic_read(&net->nr_outstanding_calls)); wait_var_event(&net->nr_outstanding_calls, !atomic_read(&net->nr_outstanding_calls)); _debug("no outstanding calls"); kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); net->socket->sk->sk_user_data = NULL; sock_release(net->socket); key_put(net->fs_cm_token_key); _debug("dework"); _leave(""); } /* * Allocate a call. */ static struct afs_call *afs_alloc_call(struct afs_net *net, const struct afs_call_type *type, gfp_t gfp) { struct afs_call *call; int o; call = kzalloc(sizeof(*call), gfp); if (!call) return NULL; call->type = type; call->net = net; call->debug_id = atomic_inc_return(&rxrpc_debug_id); refcount_set(&call->ref, 1); INIT_WORK(&call->async_work, type->async_rx ?: afs_process_async_call); INIT_WORK(&call->work, call->type->work); INIT_WORK(&call->free_work, afs_deferred_free_worker); init_waitqueue_head(&call->waitq); spin_lock_init(&call->state_lock); call->iter = &call->def_iter; o = atomic_inc_return(&net->nr_outstanding_calls); trace_afs_call(call->debug_id, afs_call_trace_alloc, 1, o, __builtin_return_address(0)); return call; } static void afs_free_call(struct afs_call *call) { struct afs_net *net = call->net; int o; ASSERT(!work_pending(&call->async_work)); rxrpc_kernel_put_peer(call->peer); if (call->rxcall) { rxrpc_kernel_shutdown_call(net->socket, call->rxcall); rxrpc_kernel_put_call(net->socket, call->rxcall); call->rxcall = NULL; } if (call->type->destructor) call->type->destructor(call); afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call); kfree(call->request); o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(call->debug_id, afs_call_trace_free, 0, o, __builtin_return_address(0)); kfree(call); o = atomic_dec_return(&net->nr_outstanding_calls); if (o == 0) wake_up_var(&net->nr_outstanding_calls); } /* * Dispose of a reference on a call. */ void afs_put_call(struct afs_call *call) { struct afs_net *net = call->net; unsigned int debug_id = call->debug_id; bool zero; int r, o; zero = __refcount_dec_and_test(&call->ref, &r); o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, __builtin_return_address(0)); if (zero) afs_free_call(call); } static void afs_deferred_free_worker(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, free_work); afs_free_call(call); } /* * Dispose of a reference on a call, deferring the cleanup to a workqueue * to avoid lock recursion. */ void afs_deferred_put_call(struct afs_call *call) { struct afs_net *net = call->net; unsigned int debug_id = call->debug_id; bool zero; int r, o; zero = __refcount_dec_and_test(&call->ref, &r); o = atomic_read(&net->nr_outstanding_calls); trace_afs_call(debug_id, afs_call_trace_put, r - 1, o, __builtin_return_address(0)); if (zero) schedule_work(&call->free_work); } /* * Queue the call for actual work. */ static void afs_queue_call_work(struct afs_call *call) { if (call->type->work) { afs_get_call(call, afs_call_trace_work); if (!queue_work(afs_wq, &call->work)) afs_put_call(call); } } /* * allocate a call with flat request and reply buffers */ struct afs_call *afs_alloc_flat_call(struct afs_net *net, const struct afs_call_type *type, size_t request_size, size_t reply_max) { struct afs_call *call; call = afs_alloc_call(net, type, GFP_NOFS); if (!call) goto nomem_call; if (request_size) { call->request_size = request_size; call->request = kmalloc(request_size, GFP_NOFS); if (!call->request) goto nomem_free; } if (reply_max) { call->reply_max = reply_max; call->buffer = kmalloc(reply_max, GFP_NOFS); if (!call->buffer) goto nomem_free; } afs_extract_to_buf(call, call->reply_max); call->operation_ID = type->op; init_waitqueue_head(&call->waitq); return call; nomem_free: afs_put_call(call); nomem_call: return NULL; } /* * clean up a call with flat buffer */ void afs_flat_call_destructor(struct afs_call *call) { _enter(""); kfree(call->request); call->request = NULL; kfree(call->buffer); call->buffer = NULL; } /* * Advance the AFS call state when the RxRPC call ends the transmit phase. */ static void afs_notify_end_request_tx(struct sock *sock, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; afs_set_call_state(call, AFS_CALL_CL_REQUESTING, AFS_CALL_CL_AWAIT_REPLY); } /* * Initiate a call and synchronously queue up the parameters for dispatch. Any * error is stored into the call struct, which the caller must check for. */ void afs_make_call(struct afs_call *call, gfp_t gfp) { struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; size_t len; s64 tx_total_len; int ret; _enter(",{%pISp+%u},", rxrpc_kernel_remote_addr(call->peer), call->service_id); ASSERT(call->type != NULL); ASSERT(call->type->name != NULL); _debug("____MAKE %p{%s,%x} [%d]____", call, call->type->name, key_serial(call->key), atomic_read(&call->net->nr_outstanding_calls)); trace_afs_make_call(call); /* Work out the length we're going to transmit. This is awkward for * calls such as FS.StoreData where there's an extra injection of data * after the initial fixed part. */ tx_total_len = call->request_size; if (call->write_iter) tx_total_len += iov_iter_count(call->write_iter); /* If the call is going to be asynchronous, we need an extra ref for * the call to hold itself so the caller need not hang on to its ref. */ if (call->async) { afs_get_call(call, afs_call_trace_get); call->drop_ref = true; } /* create a call */ rxcall = rxrpc_kernel_begin_call(call->net->socket, call->peer, call->key, (unsigned long)call, tx_total_len, call->max_lifespan, gfp, (call->async ? afs_wake_up_async_call : afs_wake_up_call_waiter), call->service_id, call->upgrade, (call->intr ? RXRPC_PREINTERRUPTIBLE : RXRPC_UNINTERRUPTIBLE), call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); call->error = ret; goto error_kill_call; } call->rxcall = rxcall; call->issue_time = ktime_get_real(); /* send the request */ iov[0].iov_base = call->request; iov[0].iov_len = call->request_size; msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0); ret = rxrpc_kernel_send_data(call->net->socket, rxcall, &msg, call->request_size, afs_notify_end_request_tx); if (ret < 0) goto error_do_abort; if (call->write_iter) { msg.msg_iter = *call->write_iter; msg.msg_flags &= ~MSG_MORE; trace_afs_send_data(call, &msg); ret = rxrpc_kernel_send_data(call->net->socket, call->rxcall, &msg, iov_iter_count(&msg.msg_iter), afs_notify_end_request_tx); *call->write_iter = msg.msg_iter; trace_afs_sent_data(call, &msg, ret); if (ret < 0) goto error_do_abort; } /* Note that at this point, we may have received the reply or an abort * - and an asynchronous call may already have completed. * * afs_wait_for_call_to_complete(call) * must be called to synchronously clean up. */ return; error_do_abort: if (ret != -ECONNABORTED) rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, afs_abort_send_data_error); if (call->async) { afs_see_call(call, afs_call_trace_async_abort); return; } if (ret == -ECONNABORTED) { len = 0; iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); call->responded = true; } call->error = ret; trace_afs_call_done(call); error_kill_call: if (call->async) afs_see_call(call, afs_call_trace_async_kill); if (call->type->immediate_cancel) call->type->immediate_cancel(call); /* We need to dispose of the extra ref we grabbed for an async call. * The call, however, might be queued on afs_async_calls and we need to * make sure we don't get any more notifications that might requeue it. */ if (call->rxcall) rxrpc_kernel_shutdown_call(call->net->socket, call->rxcall); if (call->async) { if (cancel_work_sync(&call->async_work)) afs_put_call(call); afs_set_call_complete(call, ret, 0); } call->error = ret; call->state = AFS_CALL_COMPLETE; _leave(" = %d", ret); } /* * Log remote abort codes that indicate that we have a protocol disagreement * with the server. */ static void afs_log_error(struct afs_call *call, s32 remote_abort) { static int max = 0; const char *msg; int m; switch (remote_abort) { case RX_EOF: msg = "unexpected EOF"; break; case RXGEN_CC_MARSHAL: msg = "client marshalling"; break; case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break; case RXGEN_SS_MARSHAL: msg = "server marshalling"; break; case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break; case RXGEN_DECODE: msg = "opcode decode"; break; case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break; case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break; case -32: msg = "insufficient data"; break; default: return; } m = max; if (m < 3) { max = m + 1; pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", msg, call->type->name, rxrpc_kernel_remote_addr(call->peer)); } } /* * deliver messages to a call */ void afs_deliver_to_call(struct afs_call *call) { enum afs_call_state state; size_t len; u32 abort_code, remote_abort = 0; int ret; _enter("%s", call->type->name); while (state = READ_ONCE(call->state), state == AFS_CALL_CL_AWAIT_REPLY || state == AFS_CALL_SV_AWAIT_OP_ID || state == AFS_CALL_SV_AWAIT_REQUEST || state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { len = 0; iov_iter_kvec(&call->def_iter, ITER_DEST, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, &len, false, &remote_abort, &call->service_id); trace_afs_receive_data(call, &call->def_iter, false, ret); if (ret == -EINPROGRESS || ret == -EAGAIN) return; if (ret < 0 || ret == 1) { if (ret == 1) ret = 0; goto call_complete; } return; } ret = call->type->deliver(call); state = READ_ONCE(call->state); if (ret == 0 && call->unmarshalling_error) ret = -EBADMSG; switch (ret) { case 0: call->responded = true; afs_queue_call_work(call); if (state == AFS_CALL_CL_PROC_REPLY) { if (call->op) set_bit(AFS_SERVER_FL_MAY_HAVE_CB, &call->op->server->flags); goto call_complete; } ASSERTCMP(state, >, AFS_CALL_CL_PROC_REPLY); goto done; case -EINPROGRESS: case -EAGAIN: goto out; case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); call->responded = true; afs_log_error(call, call->abort_code); goto done; case -ENOTSUPP: call->responded = true; abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, afs_abort_op_not_supported); goto local_abort; case -EIO: pr_err("kAFS: Call %u in bad state %u\n", call->debug_id, state); fallthrough; case -ENODATA: case -EBADMSG: case -EMSGSIZE: case -ENOMEM: case -EFAULT: abort_code = RXGEN_CC_UNMARSHAL; if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, afs_abort_unmarshal_error); goto local_abort; default: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, afs_abort_general_error); goto local_abort; } } done: if (call->type->done) call->type->done(call); out: _leave(""); return; local_abort: abort_code = 0; call_complete: afs_set_call_complete(call, ret, remote_abort); goto done; } /* * Wait synchronously for a call to complete. */ void afs_wait_for_call_to_complete(struct afs_call *call) { bool rxrpc_complete = false; _enter(""); if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { DECLARE_WAITQUEUE(myself, current); add_wait_queue(&call->waitq, &myself); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); /* deliver any messages that are in the queue */ if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && call->need_attention) { call->need_attention = false; __set_current_state(TASK_RUNNING); afs_deliver_to_call(call); continue; } if (afs_check_call_state(call, AFS_CALL_COMPLETE)) break; if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { /* rxrpc terminated the call. */ rxrpc_complete = true; break; } schedule(); } remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); } if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { if (rxrpc_complete) { afs_set_call_complete(call, call->error, call->abort_code); } else { /* Kill off the call if it's still live. */ _debug("call interrupted"); if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall, RX_USER_ABORT, -EINTR, afs_abort_interrupted)) afs_set_call_complete(call, -EINTR, 0); } } } /* * wake up a waiting call */ static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; call->need_attention = true; wake_up(&call->waitq); } /* * Wake up an asynchronous call. The caller is holding the call notify * spinlock around this, so we can't call afs_put_call(). */ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; int r; trace_afs_notify_call(rxcall, call); call->need_attention = true; if (__refcount_inc_not_zero(&call->ref, &r)) { trace_afs_call(call->debug_id, afs_call_trace_wake, r + 1, atomic_read(&call->net->nr_outstanding_calls), __builtin_return_address(0)); if (!queue_work(afs_async_calls, &call->async_work)) afs_deferred_put_call(call); } } /* * Perform I/O processing on an asynchronous call. The work item carries a ref * to the call struct that we either need to release or to pass on. */ static void afs_process_async_call(struct work_struct *work) { struct afs_call *call = container_of(work, struct afs_call, async_work); _enter(""); if (call->state < AFS_CALL_COMPLETE && call->need_attention) { call->need_attention = false; afs_deliver_to_call(call); } afs_put_call(call); _leave(""); } static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_call *call = (struct afs_call *)user_call_ID; call->rxcall = rxcall; } /* * Charge the incoming call preallocation. */ void afs_charge_preallocation(struct work_struct *work) { struct afs_net *net = container_of(work, struct afs_net, charge_preallocation_work); struct afs_call *call = net->spare_incoming_call; for (;;) { if (!call) { call = afs_alloc_call(net, &afs_RXCMxxxx, GFP_KERNEL); if (!call) break; call->drop_ref = true; call->async = true; call->state = AFS_CALL_SV_AWAIT_OP_ID; init_waitqueue_head(&call->waitq); afs_extract_to_tmp(call); } if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) break; call = NULL; } net->spare_incoming_call = call; } /* * Discard a preallocated call when a socket is shut down. */ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_call *call = (struct afs_call *)user_call_ID; call->rxcall = NULL; afs_put_call(call); } /* * Notification of an incoming call. */ static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { struct afs_call *call = (struct afs_call *)user_call_ID; struct afs_net *net = afs_sock2net(sk); call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall); call->server = afs_find_server(call->peer); if (!call->server) trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer)); queue_work(afs_wq, &net->charge_preallocation_work); } /* * Grab the operation ID from an incoming cache manager call. The socket * buffer is discarded on error or if we don't yet have sufficient data. */ static int afs_deliver_cm_op_id(struct afs_call *call) { int ret; _enter("{%zu}", iov_iter_count(call->iter)); /* the operation ID forms the first four bytes of the request data */ ret = afs_extract_data(call, true); if (ret < 0) return ret; call->operation_ID = ntohl(call->tmp); afs_set_call_state(call, AFS_CALL_SV_AWAIT_OP_ID, AFS_CALL_SV_AWAIT_REQUEST); /* ask the cache manager to route the call (it'll change the call type * if successful) */ if (!afs_cm_incoming_call(call)) return -ENOTSUPP; call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, &call->service_id, &call->enctype); trace_afs_cb_call(call); call->work.func = call->type->work; /* pass responsibility for the remainder of this message off to the * cache manager op */ return call->type->deliver(call); } /* * Advance the AFS call state when an RxRPC service call ends the transmit * phase. */ static void afs_notify_end_reply_tx(struct sock *sock, struct rxrpc_call *rxcall, unsigned long call_user_ID) { struct afs_call *call = (struct afs_call *)call_user_ID; afs_set_call_state(call, AFS_CALL_SV_REPLYING, AFS_CALL_SV_AWAIT_ACK); } /* * send an empty reply */ void afs_send_empty_reply(struct afs_call *call) { struct afs_net *net = call->net; struct msghdr msg; _enter(""); rxrpc_kernel_set_tx_length(net->socket, call->rxcall, 0); msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; switch (rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, 0, afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); return; case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, RXGEN_SS_MARSHAL, -ENOMEM, afs_abort_oom); fallthrough; default: _leave(" [error]"); return; } } /* * send a simple reply */ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) { struct afs_net *net = call->net; struct msghdr msg; struct kvec iov[1]; int n; _enter(""); rxrpc_kernel_set_tx_length(net->socket, call->rxcall, len); iov[0].iov_base = (void *) buf; iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; n = rxrpc_kernel_send_data(net->socket, call->rxcall, &msg, len, afs_notify_end_reply_tx); if (n >= 0) { /* Success */ _leave(" [replied]"); return; } if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, RXGEN_SS_MARSHAL, -ENOMEM, afs_abort_oom); } _leave(" [error]"); } /* * Extract a piece of data from the received data socket buffers. */ int afs_extract_data(struct afs_call *call, bool want_more) { struct afs_net *net = call->net; struct iov_iter *iter = call->iter; enum afs_call_state state; u32 remote_abort = 0; int ret; _enter("{%s,%zu,%zu},%d", call->type->name, call->iov_len, iov_iter_count(iter), want_more); ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, &call->iov_len, want_more, &remote_abort, &call->service_id); trace_afs_receive_data(call, call->iter, want_more, ret); if (ret == 0 || ret == -EAGAIN) return ret; state = READ_ONCE(call->state); if (ret == 1) { switch (state) { case AFS_CALL_CL_AWAIT_REPLY: afs_set_call_state(call, state, AFS_CALL_CL_PROC_REPLY); break; case AFS_CALL_SV_AWAIT_REQUEST: afs_set_call_state(call, state, AFS_CALL_SV_REPLYING); break; case AFS_CALL_COMPLETE: kdebug("prem complete %d", call->error); return afs_io_error(call, afs_io_error_extract); default: break; } return 0; } afs_set_call_complete(call, ret, remote_abort); return ret; } /* * Log protocol error production. */ noinline int afs_protocol_error(struct afs_call *call, enum afs_eproto_cause cause) { trace_afs_protocol_error(call, cause); if (call) call->unmarshalling_error = true; return -EBADMSG; } /* * Wake up OOB notification processing. */ static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) { struct afs_net *net = sk->sk_user_data; schedule_work(&net->rx_oob_work); }
121 125 25 25 25 25 10 25 64 64 64 63 1 64 64 64 3 61 64 62 2 64 12 12 1 14 14 2 13 13 11 11 2 2 2 2 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2015 Intel Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/hci_mon.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie, u16 opcode, u16 len, void *buf) { struct hci_mon_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(6 + len, GFP_ATOMIC); if (!skb) return NULL; put_unaligned_le32(cookie, skb_put(skb, 4)); put_unaligned_le16(opcode, skb_put(skb, 2)); if (buf) skb_put_data(skb, buf, len); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); hdr->index = index; hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } struct sk_buff *mgmt_alloc_skb(struct hci_dev *hdev, u16 opcode, unsigned int size) { struct sk_buff *skb; skb = alloc_skb(sizeof(struct mgmt_hdr) + size, GFP_KERNEL); if (!skb) return skb; skb_reserve(skb, sizeof(struct mgmt_hdr)); bt_cb(skb)->mgmt.hdev = hdev; bt_cb(skb)->mgmt.opcode = opcode; return skb; } int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { struct hci_dev *hdev; struct mgmt_hdr *hdr; int len; if (!skb) return -EINVAL; len = skb->len; hdev = bt_cb(skb)->mgmt.hdev; /* Time stamp */ __net_timestamp(skb); /* Send just the data, without headers, to the monitor */ if (channel == HCI_CHANNEL_CONTROL) hci_send_monitor_ctrl_event(hdev, bt_cb(skb)->mgmt.opcode, skb->data, skb->len, skb_get_ktime(skb), flag, skip_sk); hdr = skb_push(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(bt_cb(skb)->mgmt.opcode); if (hdev) hdr->index = cpu_to_le16(hdev->id); else hdr->index = cpu_to_le16(MGMT_INDEX_NONE); hdr->len = cpu_to_le16(len); hci_send_to_channel(channel, skb, flag, skip_sk); kfree_skb(skb); return 0; } int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, void *data, u16 data_len, int flag, struct sock *skip_sk) { struct sk_buff *skb; skb = mgmt_alloc_skb(hdev, event, data_len); if (!skb) return -ENOMEM; if (data) skb_put_data(skb, data, data_len); return mgmt_send_event_skb(channel, skb, flag, skip_sk); } int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_status *ev; int err; BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status); skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL); if (!skb) return -ENOMEM; hdr = skb_put(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(sizeof(*ev)); ev = skb_put(skb, sizeof(*ev)); ev->status = status; ev->opcode = cpu_to_le16(cmd); mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), MGMT_EV_CMD_STATUS, sizeof(*ev), ev); if (mskb) skb->tstamp = mskb->tstamp; else __net_timestamp(skb); err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); if (mskb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, HCI_SOCK_TRUSTED, NULL); kfree_skb(mskb); } return err; } int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, void *rp, size_t rp_len) { struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_complete *ev; int err; BT_DBG("sock %p", sk); skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = skb_put(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(sizeof(*ev) + rp_len); ev = skb_put(skb, sizeof(*ev) + rp_len); ev->opcode = cpu_to_le16(cmd); ev->status = status; if (rp) memcpy(ev->data, rp, rp_len); mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), MGMT_EV_CMD_COMPLETE, sizeof(*ev) + rp_len, ev); if (mskb) skb->tstamp = mskb->tstamp; else __net_timestamp(skb); err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); if (mskb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, HCI_SOCK_TRUSTED, NULL); kfree_skb(mskb); } return err; } struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, struct hci_dev *hdev) { struct mgmt_pending_cmd *cmd; list_for_each_entry(cmd, &hdev->mgmt_pending, list) { if (hci_sock_get_channel(cmd->sk) != channel) continue; if (cmd->opcode == opcode) return cmd; } return NULL; } void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data) { struct mgmt_pending_cmd *cmd, *tmp; list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (opcode > 0 && cmd->opcode != opcode) continue; cb(cmd, data); } } struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_pending_cmd *cmd; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); if (!cmd) return NULL; cmd->opcode = opcode; cmd->index = hdev->id; cmd->param = kmemdup(data, len, GFP_KERNEL); if (!cmd->param) { kfree(cmd); return NULL; } cmd->param_len = len; cmd->sk = sk; sock_hold(sk); return cmd; } struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_pending_cmd *cmd; cmd = mgmt_pending_new(sk, opcode, hdev, data, len); if (!cmd) return NULL; list_add_tail(&cmd->list, &hdev->mgmt_pending); return cmd; } void mgmt_pending_free(struct mgmt_pending_cmd *cmd) { sock_put(cmd->sk); kfree(cmd->param); kfree(cmd); } void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) { list_del(&cmd->list); mgmt_pending_free(cmd); } void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk) { struct mgmt_mesh_tx *mesh_tx, *tmp; list_for_each_entry_safe(mesh_tx, tmp, &hdev->mgmt_pending, list) { if (!sk || mesh_tx->sk == sk) cb(mesh_tx, data); } } struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk) { struct mgmt_mesh_tx *mesh_tx; if (list_empty(&hdev->mesh_pending)) return NULL; list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) { if (!sk || mesh_tx->sk == sk) return mesh_tx; } return NULL; } struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle) { struct mgmt_mesh_tx *mesh_tx; if (list_empty(&hdev->mesh_pending)) return NULL; list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) { if (mesh_tx->handle == handle) return mesh_tx; } return NULL; } struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mesh_tx *mesh_tx; mesh_tx = kzalloc(sizeof(*mesh_tx), GFP_KERNEL); if (!mesh_tx) return NULL; hdev->mesh_send_ref++; if (!hdev->mesh_send_ref) hdev->mesh_send_ref++; mesh_tx->handle = hdev->mesh_send_ref; mesh_tx->index = hdev->id; memcpy(mesh_tx->param, data, len); mesh_tx->param_len = len; mesh_tx->sk = sk; sock_hold(sk); list_add_tail(&mesh_tx->list, &hdev->mesh_pending); return mesh_tx; } void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx) { list_del(&mesh_tx->list); sock_put(mesh_tx->sk); kfree(mesh_tx); }
10 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SEQ_FILE_H #define _LINUX_SEQ_FILE_H #include <linux/types.h> #include <linux/string.h> #include <linux/string_helpers.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/nodemask.h> #include <linux/fs.h> #include <linux/cred.h> struct seq_operations; struct seq_file { char *buf; size_t size; size_t from; size_t count; size_t pad_until; loff_t index; loff_t read_pos; struct mutex lock; const struct seq_operations *op; int poll_event; const struct file *file; void *private; }; struct seq_operations { void * (*start) (struct seq_file *m, loff_t *pos); void (*stop) (struct seq_file *m, void *v); void * (*next) (struct seq_file *m, void *v, loff_t *pos); int (*show) (struct seq_file *m, void *v); }; #define SEQ_SKIP 1 /** * seq_has_overflowed - check if the buffer has overflowed * @m: the seq_file handle * * seq_files have a buffer which may overflow. When this happens a larger * buffer is reallocated and all the data will be printed again. * The overflow state is true when m->count == m->size. * * Returns true if the buffer received more than it can hold. */ static inline bool seq_has_overflowed(struct seq_file *m) { return m->count == m->size; } /** * seq_get_buf - get buffer to write arbitrary data to * @m: the seq_file handle * @bufp: the beginning of the buffer is stored here * * Return the number of bytes available in the buffer, or zero if * there's no space. */ static inline size_t seq_get_buf(struct seq_file *m, char **bufp) { BUG_ON(m->count > m->size); if (m->count < m->size) *bufp = m->buf + m->count; else *bufp = NULL; return m->size - m->count; } /** * seq_commit - commit data to the buffer * @m: the seq_file handle * @num: the number of bytes to commit * * Commit @num bytes of data written to a buffer previously acquired * by seq_buf_get. To signal an error condition, or that the data * didn't fit in the available space, pass a negative @num value. */ static inline void seq_commit(struct seq_file *m, int num) { if (num < 0) { m->count = m->size; } else { BUG_ON(m->count + num > m->size); m->count += num; } } /** * seq_setwidth - set padding width * @m: the seq_file handle * @size: the max number of bytes to pad. * * Call seq_setwidth() for setting max width, then call seq_printf() etc. and * finally call seq_pad() to pad the remaining bytes. */ static inline void seq_setwidth(struct seq_file *m, size_t size) { m->pad_until = m->count + size; } void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); __printf(2, 0) void seq_vprintf(struct seq_file *m, const char *fmt, va_list args); __printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...); void seq_putc(struct seq_file *m, char c); void __seq_puts(struct seq_file *m, const char *s); static __always_inline void seq_puts(struct seq_file *m, const char *s) { if (!__builtin_constant_p(*s)) __seq_puts(m, s); else if (s[0] && !s[1]) seq_putc(m, s[0]); else seq_write(m, s, __builtin_strlen(s)); } void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width); void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc); static inline void seq_escape_str(struct seq_file *m, const char *src, unsigned int flags, const char *esc) { seq_escape_mem(m, src, strlen(src), flags, esc); } /** * seq_escape - print string into buffer, escaping some characters * @m: target buffer * @s: NULL-terminated string * @esc: set of characters that need escaping * * Puts string into buffer, replacing each occurrence of character from * @esc with usual octal escape. * * Use seq_has_overflowed() to check for errors. */ static inline void seq_escape(struct seq_file *m, const char *s, const char *esc) { seq_escape_str(m, s, ESCAPE_OCTAL, esc); } void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); int seq_path(struct seq_file *, const struct path *, const char *); int seq_file_path(struct seq_file *, struct file *, const char *); int seq_dentry(struct seq_file *, struct dentry *, const char *); int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc); void *single_start(struct seq_file *, loff_t *); int single_open(struct file *, int (*)(struct seq_file *, void *), void *); int single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t); int single_release(struct inode *, struct file *); void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); #ifdef CONFIG_BINARY_PRINTF __printf(2, 0) void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary); #endif #define DEFINE_SEQ_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ int ret = seq_open(file, &__name ## _sops); \ if (!ret && inode->i_private) { \ struct seq_file *seq_f = file->private_data; \ seq_f->private = inode->i_private; \ } \ return ret; \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = seq_release, \ } #define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = single_release, \ } #define DEFINE_SHOW_STORE_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .write = __name ## _write, \ .llseek = seq_lseek, \ .release = single_release, \ } #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, pde_data(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ .proc_open = __name ## _open, \ .proc_read = seq_read, \ .proc_lseek = seq_lseek, \ .proc_release = single_release, \ } static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS return seq->file->f_cred->user_ns; #else extern struct user_namespace init_user_ns; return &init_user_ns; #endif } /** * seq_show_options - display mount options with appropriate escapes. * @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, can be NULL */ static inline void seq_show_option(struct seq_file *m, const char *name, const char *value) { seq_putc(m, ','); seq_escape(m, name, ",= \t\n\\"); if (value) { seq_putc(m, '='); seq_escape(m, value, ", \t\n\\"); } } /** * seq_show_option_n - display mount options with appropriate escapes * where @value must be a specific length (i.e. * not NUL-terminated). * @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, cannot be NULL * @length: the exact length of @value to display, must be constant expression * * This is a macro since this uses "length" to define the size of the * stack buffer. */ #define seq_show_option_n(m, name, value, length) { \ char val_buf[length + 1]; \ memcpy(val_buf, value, length); \ val_buf[length] = '\0'; \ seq_show_option(m, name, val_buf); \ } #define SEQ_START_TOKEN ((void *)1) /* * Helpers for iteration over list_head-s in seq_files */ extern struct list_head *seq_list_start(struct list_head *head, loff_t pos); extern struct list_head *seq_list_start_head(struct list_head *head, loff_t pos); extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos); extern struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos); extern struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos); extern struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos); /* * Helpers for iteration over hlist_head-s in seq_files */ extern struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos); extern struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos); /* Helpers for iterating over per-cpu hlist_head-s in seq_files */ extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos); extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos); void seq_file_init(void); #endif
157 689 155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_NS_H #define _LINUX_PID_NS_H #include <linux/sched.h> #include <linux/bug.h> #include <linux/mm.h> #include <linux/workqueue.h> #include <linux/threads.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/idr.h> /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ #define MAX_PID_NS_LEVEL 32 struct fs_pin; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) /* modes for vm.memfd_noexec sysctl */ #define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */ #define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */ #endif struct pid_namespace { struct idr idr; struct rcu_head rcu; unsigned int pid_allocated; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; unsigned int level; int pid_max; struct pid_namespace *parent; #ifdef CONFIG_BSD_PROCESS_ACCT struct fs_pin *bacct; #endif struct user_namespace *user_ns; struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #if defined(CONFIG_MEMFD_CREATE) int memfd_noexec_scope; #endif #endif } __randomize_layout; extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) refcount_inc(&ns->ns.count); return ns; } #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { int scope = MEMFD_NOEXEC_SCOPE_EXEC; for (; ns; ns = ns->parent) scope = max(scope, READ_ONCE(ns->memfd_noexec_scope)); return scope; } #else static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } #endif extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); #else /* !CONFIG_PID_NS */ #include <linux/err.h> static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { return ns; } static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) { return 0; } static inline struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) ns = ERR_PTR(-EINVAL); return ns; } static inline void put_pid_ns(struct pid_namespace *ns) { } static inline void zap_pid_ns_processes(struct pid_namespace *ns) { BUG(); } static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); int register_pidns_sysctls(struct pid_namespace *pidns); void unregister_pidns_sysctls(struct pid_namespace *pidns); static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) { return task_active_pid_ns(tsk) == &init_pid_ns; } #endif /* _LINUX_PID_NS_H */
34682 34721 7421 7419 105 4 11 83 12 1 36 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/kernel.h> #include <linux/nospec.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/wordpart.h> /* out-of-line parts */ #if !defined(INLINE_COPY_FROM_USER) || defined(CONFIG_RUST) unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) { return _inline_copy_from_user(to, from, n); } EXPORT_SYMBOL(_copy_from_user); #endif #if !defined(INLINE_COPY_TO_USER) || defined(CONFIG_RUST) unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n) { return _inline_copy_to_user(to, from, n); } EXPORT_SYMBOL(_copy_to_user); #endif /** * check_zeroed_user: check if a userspace buffer only contains zero bytes * @from: Source address, in userspace. * @size: Size of buffer. * * This is effectively shorthand for "memchr_inv(from, 0, size) == NULL" for * userspace addresses (and is more efficient because we don't care where the * first non-zero byte is). * * Returns: * * 0: There were non-zero bytes present in the buffer. * * 1: The buffer was full of zero bytes. * * -EFAULT: access to userspace failed. */ int check_zeroed_user(const void __user *from, size_t size) { unsigned long val; uintptr_t align = (uintptr_t) from % sizeof(unsigned long); if (unlikely(size == 0)) return 1; from -= align; size += align; if (!user_read_access_begin(from, size)) return -EFAULT; unsafe_get_user(val, (unsigned long __user *) from, err_fault); if (align) val &= ~aligned_byte_mask(align); while (size > sizeof(unsigned long)) { if (unlikely(val)) goto done; from += sizeof(unsigned long); size -= sizeof(unsigned long); unsafe_get_user(val, (unsigned long __user *) from, err_fault); } if (size < sizeof(unsigned long)) val &= aligned_byte_mask(size); done: user_read_access_end(); return (val == 0); err_fault: user_read_access_end(); return -EFAULT; } EXPORT_SYMBOL(check_zeroed_user);
1642 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 // SPDX-License-Identifier: GPL-2.0 /* * Dynamic byte queue limits. See include/linux/dynamic_queue_limits.h * * Copyright (c) 2011, Tom Herbert <therbert@google.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/dynamic_queue_limits.h> #include <linux/compiler.h> #include <linux/export.h> #include <trace/events/napi.h> #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0) #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0) static void dql_check_stall(struct dql *dql, unsigned short stall_thrs) { unsigned long now; if (!stall_thrs) return; now = jiffies; /* Check for a potential stall */ if (time_after_eq(now, dql->last_reap + stall_thrs)) { unsigned long hist_head, t, start, end; /* We are trying to detect a period of at least @stall_thrs * jiffies without any Tx completions, but during first half * of which some Tx was posted. */ dqs_again: hist_head = READ_ONCE(dql->history_head); /* pairs with smp_wmb() in dql_queued() */ smp_rmb(); /* Get the previous entry in the ring buffer, which is the * oldest sample. */ start = (hist_head - DQL_HIST_LEN + 1) * BITS_PER_LONG; /* Advance start to continue from the last reap time */ if (time_before(start, dql->last_reap + 1)) start = dql->last_reap + 1; /* Newest sample we should have already seen a completion for */ end = hist_head * BITS_PER_LONG + (BITS_PER_LONG - 1); /* Shrink the search space to [start, (now - start_thrs/2)] if * `end` is beyond the stall zone */ if (time_before(now, end + stall_thrs / 2)) end = now - stall_thrs / 2; /* Search for the queued time in [t, end] */ for (t = start; time_before_eq(t, end); t++) if (test_bit(t % (DQL_HIST_LEN * BITS_PER_LONG), dql->history)) break; /* Variable t contains the time of the queue */ if (!time_before_eq(t, end)) goto no_stall; /* The ring buffer was modified in the meantime, retry */ if (hist_head != READ_ONCE(dql->history_head)) goto dqs_again; dql->stall_cnt++; dql->stall_max = max_t(unsigned short, dql->stall_max, now - t); trace_dql_stall_detected(dql->stall_thrs, now - t, dql->last_reap, dql->history_head, now, dql->history); } no_stall: dql->last_reap = now; } /* Records completed count and recalculates the queue limit */ void dql_completed(struct dql *dql, unsigned int count) { unsigned int inprogress, prev_inprogress, limit; unsigned int ovlimit, completed, num_queued; unsigned short stall_thrs; bool all_prev_completed; num_queued = READ_ONCE(dql->num_queued); /* Read stall_thrs in advance since it belongs to the same (first) * cache line as ->num_queued. This way, dql_check_stall() does not * need to touch the first cache line again later, reducing the window * of possible false sharing. */ stall_thrs = READ_ONCE(dql->stall_thrs); /* Can't complete more than what's in queue */ BUG_ON(count > num_queued - dql->num_completed); completed = dql->num_completed + count; limit = dql->limit; ovlimit = POSDIFF(num_queued - dql->num_completed, limit); inprogress = num_queued - completed; prev_inprogress = dql->prev_num_queued - dql->num_completed; all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued); if ((ovlimit && !inprogress) || (dql->prev_ovlimit && all_prev_completed)) { /* * Queue considered starved if: * - The queue was over-limit in the last interval, * and there is no more data in the queue. * OR * - The queue was over-limit in the previous interval and * when enqueuing it was possible that all queued data * had been consumed. This covers the case when queue * may have becomes starved between completion processing * running and next time enqueue was scheduled. * * When queue is starved increase the limit by the amount * of bytes both sent and completed in the last interval, * plus any previous over-limit. */ limit += POSDIFF(completed, dql->prev_num_queued) + dql->prev_ovlimit; dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } else if (inprogress && prev_inprogress && !all_prev_completed) { /* * Queue was not starved, check if the limit can be decreased. * A decrease is only considered if the queue has been busy in * the whole interval (the check above). * * If there is slack, the amount of excess data queued above * the amount needed to prevent starvation, the queue limit * can be decreased. To avoid hysteresis we consider the * minimum amount of slack found over several iterations of the * completion routine. */ unsigned int slack, slack_last_objs; /* * Slack is the maximum of * - The queue limit plus previous over-limit minus twice * the number of objects completed. Note that two times * number of completed bytes is a basis for an upper bound * of the limit. * - Portion of objects in the last queuing operation that * was not part of non-zero previous over-limit. That is * "round down" by non-overlimit portion of the last * queueing operation. */ slack = POSDIFF(limit + dql->prev_ovlimit, 2 * (completed - dql->num_completed)); slack_last_objs = dql->prev_ovlimit ? POSDIFF(dql->prev_last_obj_cnt, dql->prev_ovlimit) : 0; slack = max(slack, slack_last_objs); if (slack < dql->lowest_slack) dql->lowest_slack = slack; if (time_after(jiffies, dql->slack_start_time + dql->slack_hold_time)) { limit = POSDIFF(limit, dql->lowest_slack); dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } } /* Enforce bounds on limit */ limit = clamp(limit, dql->min_limit, dql->max_limit); if (limit != dql->limit) { dql->limit = limit; ovlimit = 0; } dql->adj_limit = limit + completed; dql->prev_ovlimit = ovlimit; dql->prev_last_obj_cnt = READ_ONCE(dql->last_obj_cnt); dql->num_completed = completed; dql->prev_num_queued = num_queued; dql_check_stall(dql, stall_thrs); } EXPORT_SYMBOL(dql_completed); void dql_reset(struct dql *dql) { /* Reset all dynamic values */ dql->limit = dql->min_limit; dql->num_queued = 0; dql->num_completed = 0; dql->last_obj_cnt = 0; dql->prev_num_queued = 0; dql->prev_last_obj_cnt = 0; dql->prev_ovlimit = 0; dql->lowest_slack = UINT_MAX; dql->slack_start_time = jiffies; dql->last_reap = jiffies; dql->history_head = jiffies / BITS_PER_LONG; memset(dql->history, 0, sizeof(dql->history)); } EXPORT_SYMBOL(dql_reset); void dql_init(struct dql *dql, unsigned int hold_time) { dql->max_limit = DQL_MAX_LIMIT; dql->min_limit = 0; dql->slack_hold_time = hold_time; dql->stall_thrs = 0; dql_reset(dql); } EXPORT_SYMBOL(dql_init);
1 7 1 1 35 35 35 35 35 21 21 21 28 28 28 21 6 7 7 8 8 6 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2022, SUSE. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/spinlock.h> #include "protocol.h" static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); static int mptcp_sched_default_get_send(struct mptcp_sock *msk) { struct sock *ssk; ssk = mptcp_subflow_get_send(msk); if (!ssk) return -EINVAL; mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); return 0; } static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk) { struct sock *ssk; ssk = mptcp_subflow_get_retrans(msk); if (!ssk) return -EINVAL; mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); return 0; } static struct mptcp_sched_ops mptcp_sched_default = { .get_send = mptcp_sched_default_get_send, .get_retrans = mptcp_sched_default_get_retrans, .name = "default", .owner = THIS_MODULE, }; /* Must be called with rcu read lock held */ struct mptcp_sched_ops *mptcp_sched_find(const char *name) { struct mptcp_sched_ops *sched, *ret = NULL; list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { if (!strcmp(sched->name, name)) { ret = sched; break; } } return ret; } /* Build string with list of available scheduler values. * Similar to tcp_get_available_congestion_control() */ void mptcp_get_available_schedulers(char *buf, size_t maxlen) { struct mptcp_sched_ops *sched; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(sched, &mptcp_sched_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", sched->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } int mptcp_validate_scheduler(struct mptcp_sched_ops *sched) { if (!sched->get_send) { pr_err("%s does not implement required ops\n", sched->name); return -EINVAL; } return 0; } int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { int ret; ret = mptcp_validate_scheduler(sched); if (ret) return ret; spin_lock(&mptcp_sched_list_lock); if (mptcp_sched_find(sched->name)) { spin_unlock(&mptcp_sched_list_lock); return -EEXIST; } list_add_tail_rcu(&sched->list, &mptcp_sched_list); spin_unlock(&mptcp_sched_list_lock); pr_debug("%s registered\n", sched->name); return 0; } void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched) { if (sched == &mptcp_sched_default) return; spin_lock(&mptcp_sched_list_lock); list_del_rcu(&sched->list); spin_unlock(&mptcp_sched_list_lock); } void mptcp_sched_init(void) { mptcp_register_scheduler(&mptcp_sched_default); } int mptcp_init_sched(struct mptcp_sock *msk, struct mptcp_sched_ops *sched) { if (!sched) sched = &mptcp_sched_default; if (!bpf_try_module_get(sched, sched->owner)) return -EBUSY; msk->sched = sched; if (msk->sched->init) msk->sched->init(msk); pr_debug("sched=%s\n", msk->sched->name); return 0; } void mptcp_release_sched(struct mptcp_sock *msk) { struct mptcp_sched_ops *sched = msk->sched; if (!sched) return; msk->sched = NULL; if (sched->release) sched->release(msk); bpf_module_put(sched, sched->owner); } void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, bool scheduled) { WRITE_ONCE(subflow->scheduled, scheduled); } int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); /* the following check is moved out of mptcp_subflow_get_send */ if (__mptcp_check_fallback(msk)) { if (msk->first && __tcp_can_send(msk->first) && sk_stream_memory_free(msk->first)) { mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true); return 0; } return -EINVAL; } mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_send(msk); return msk->sched->get_send(msk); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); /* the following check is moved out of mptcp_subflow_get_retrans */ if (__mptcp_check_fallback(msk)) return -EINVAL; mptcp_for_each_subflow(msk, subflow) { if (READ_ONCE(subflow->scheduled)) return 0; } if (msk->sched == &mptcp_sched_default || !msk->sched) return mptcp_sched_default_get_retrans(msk); if (msk->sched->get_retrans) return msk->sched->get_retrans(msk); return msk->sched->get_send(msk); }
17 17 17 17 17 17 17 17 17 17 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 // SPDX-License-Identifier: GPL-2.0 /* * linux/net/sunrpc/auth_unix.c * * UNIX-style authentication; no AUTH_SHORT support * * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/mempool.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/user_namespace.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif static struct rpc_auth unix_auth; static const struct rpc_credops unix_credops; static mempool_t *unix_pool; static struct rpc_auth * unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { refcount_inc(&unix_auth.au_count); return &unix_auth; } static void unx_destroy(struct rpc_auth *auth) { } /* * Lookup AUTH_UNIX creds for current process */ static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { struct rpc_cred *ret; ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask()); if (!ret) { if (!(flags & RPCAUTH_LOOKUP_ASYNC)) return ERR_PTR(-ENOMEM); ret = mempool_alloc(unix_pool, GFP_NOWAIT); if (!ret) return ERR_PTR(-ENOMEM); } rpcauth_init_cred(ret, acred, auth, &unix_credops); ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; return ret; } static void unx_free_cred_callback(struct rcu_head *head) { struct rpc_cred *rpc_cred = container_of(head, struct rpc_cred, cr_rcu); put_cred(rpc_cred->cr_cred); mempool_free(rpc_cred, unix_pool); } static void unx_destroy_cred(struct rpc_cred *cred) { call_rcu(&cred->cr_rcu, unx_free_cred_callback); } /* * Match credentials against current the auth_cred. */ static int unx_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) { unsigned int groups = 0; unsigned int i; if (cred->cr_cred == acred->cred) return 1; if (!uid_eq(cred->cr_cred->fsuid, acred->cred->fsuid) || !gid_eq(cred->cr_cred->fsgid, acred->cred->fsgid)) return 0; if (acred->cred->group_info != NULL) groups = acred->cred->group_info->ngroups; if (groups > UNX_NGROUPS) groups = UNX_NGROUPS; if (cred->cr_cred->group_info == NULL) return groups == 0; if (groups != cred->cr_cred->group_info->ngroups) return 0; for (i = 0; i < groups ; i++) if (!gid_eq(cred->cr_cred->group_info->gid[i], acred->cred->group_info->gid[i])) return 0; return 1; } /* * Marshal credentials. * Maybe we should keep a cached credential for performance reasons. */ static int unx_marshal(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_clnt *clnt = task->tk_client; struct rpc_cred *cred = task->tk_rqstp->rq_cred; __be32 *p, *cred_len, *gidarr_len; int i; struct group_info *gi = cred->cr_cred->group_info; struct user_namespace *userns = clnt->cl_cred ? clnt->cl_cred->user_ns : &init_user_ns; /* Credential */ p = xdr_reserve_space(xdr, 3 * sizeof(*p)); if (!p) goto marshal_failed; *p++ = rpc_auth_unix; cred_len = p++; *p++ = xdr_zero; /* stamp */ if (xdr_stream_encode_opaque(xdr, clnt->cl_nodename, clnt->cl_nodelen) < 0) goto marshal_failed; p = xdr_reserve_space(xdr, 3 * sizeof(*p)); if (!p) goto marshal_failed; *p++ = cpu_to_be32(from_kuid_munged(userns, cred->cr_cred->fsuid)); *p++ = cpu_to_be32(from_kgid_munged(userns, cred->cr_cred->fsgid)); gidarr_len = p++; if (gi) for (i = 0; i < UNX_NGROUPS && i < gi->ngroups; i++) *p++ = cpu_to_be32(from_kgid_munged(userns, gi->gid[i])); *gidarr_len = cpu_to_be32(p - gidarr_len - 1); *cred_len = cpu_to_be32((p - cred_len - 1) << 2); p = xdr_reserve_space(xdr, (p - gidarr_len - 1) << 2); if (!p) goto marshal_failed; /* Verifier */ p = xdr_reserve_space(xdr, 2 * sizeof(*p)); if (!p) goto marshal_failed; *p++ = rpc_auth_null; *p = xdr_zero; return 0; marshal_failed: return -EMSGSIZE; } /* * Refresh credentials. This is a no-op for AUTH_UNIX */ static int unx_refresh(struct rpc_task *task) { set_bit(RPCAUTH_CRED_UPTODATE, &task->tk_rqstp->rq_cred->cr_flags); return 0; } static int unx_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_auth *auth = task->tk_rqstp->rq_cred->cr_auth; __be32 *p; u32 size; p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) return -EIO; switch (*p++) { case rpc_auth_null: case rpc_auth_unix: case rpc_auth_short: break; default: return -EIO; } size = be32_to_cpup(p); if (size > RPC_MAX_AUTH_SIZE) return -EIO; p = xdr_inline_decode(xdr, size); if (!p) return -EIO; auth->au_verfsize = XDR_QUADLEN(size) + 2; auth->au_rslack = XDR_QUADLEN(size) + 2; auth->au_ralign = XDR_QUADLEN(size) + 2; return 0; } int __init rpc_init_authunix(void) { unix_pool = mempool_create_kmalloc_pool(16, sizeof(struct rpc_cred)); return unix_pool ? 0 : -ENOMEM; } void rpc_destroy_authunix(void) { mempool_destroy(unix_pool); } const struct rpc_authops authunix_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_UNIX, .au_name = "UNIX", .create = unx_create, .destroy = unx_destroy, .lookup_cred = unx_lookup_cred, }; static struct rpc_auth unix_auth = { .au_cslack = UNX_CALLSLACK, .au_rslack = NUL_REPLYSLACK, .au_verfsize = NUL_REPLYSLACK, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = REFCOUNT_INIT(1), }; static const struct rpc_credops unix_credops = { .cr_name = "AUTH_UNIX", .crdestroy = unx_destroy_cred, .crmatch = unx_match, .crmarshal = unx_marshal, .crwrap_req = rpcauth_wrap_req_encode, .crrefresh = unx_refresh, .crvalidate = unx_validate, .crunwrap_resp = rpcauth_unwrap_resp_decode, };
3794 2543 1277 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __LICENSE_H #define __LICENSE_H static inline int license_is_gpl_compatible(const char *license) { return (strcmp(license, "GPL") == 0 || strcmp(license, "GPL v2") == 0 || strcmp(license, "GPL and additional rights") == 0 || strcmp(license, "Dual BSD/GPL") == 0 || strcmp(license, "Dual MIT/GPL") == 0 || strcmp(license, "Dual MPL/GPL") == 0); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 /* SPDX-License-Identifier: GPL-2.0-only */ /* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. */ #ifndef __AF_VSOCK_H__ #define __AF_VSOCK_H__ #include <linux/kernel.h> #include <linux/workqueue.h> #include <net/sock.h> #include <uapi/linux/vm_sockets.h> #include "vsock_addr.h" #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; extern spinlock_t vsock_table_lock; #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) struct vsock_sock { /* sk must be the first member. */ struct sock sk; const struct vsock_transport *transport; struct sockaddr_vm local_addr; struct sockaddr_vm remote_addr; /* Links for the global tables of bound and connected sockets. */ struct list_head bound_table; struct list_head connected_table; /* Accessed without the socket lock held. This means it can never be * modified outsided of socket create or destruct. */ bool trusted; bool cached_peer_allow_dgram; /* Dgram communication allowed to * cached peer? */ u32 cached_peer; /* Context ID of last dgram destination check. */ const struct cred *owner; /* Rest are SOCK_STREAM only. */ long connect_timeout; /* Listening socket that this came from. */ struct sock *listener; /* Used for pending list and accept queue during connection handshake. * The listening socket is the head for both lists. Sockets created * for connection requests are placed in the pending list until they * are connected, at which point they are put in the accept queue list * so they can be accepted in accept(). If accept() cannot accept the * connection, it is marked as rejected so the cleanup function knows * to clean up the socket. */ struct list_head pending_links; struct list_head accept_queue; bool rejected; struct delayed_work connect_work; struct delayed_work pending_work; struct delayed_work close_work; bool close_work_scheduled; u32 peer_shutdown; bool sent_request; bool ignore_connecting_rst; /* Protected by lock_sock(sk) */ u64 buffer_size; u64 buffer_min_size; u64 buffer_max_size; /* Private to transport. */ void *trans; }; s64 vsock_connectible_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); struct sock *vsock_create_connected(struct sock *parent); void vsock_data_ready(struct sock *sk); /**** TRANSPORT ****/ struct vsock_transport_recv_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ bool notify_on_block; }; struct vsock_transport_send_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ }; /* Transport features flags */ /* Transport provides host->guest communication */ #define VSOCK_TRANSPORT_F_H2G 0x00000001 /* Transport provides guest->host communication */ #define VSOCK_TRANSPORT_F_G2H 0x00000002 /* Transport provides DGRAM communication */ #define VSOCK_TRANSPORT_F_DGRAM 0x00000004 /* Transport provides local (loopback) communication */ #define VSOCK_TRANSPORT_F_LOCAL 0x00000008 struct vsock_transport { struct module *module; /* Initialize/tear-down socket. */ int (*init)(struct vsock_sock *, struct vsock_sock *); void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); /* Cancel all pending packets sent on vsock. */ int (*cancel_pkt)(struct vsock_sock *vsk); /* Connections. */ int (*connect)(struct vsock_sock *); /* DGRAM. */ int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *); int (*dgram_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags); int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, struct msghdr *, size_t len); bool (*dgram_allow)(u32 cid, u32 port); /* STREAM. */ /* TODO: stream_bind() */ ssize_t (*stream_dequeue)(struct vsock_sock *, struct msghdr *, size_t len, int flags); ssize_t (*stream_enqueue)(struct vsock_sock *, struct msghdr *, size_t len); s64 (*stream_has_data)(struct vsock_sock *); s64 (*stream_has_space)(struct vsock_sock *); u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, int flags); int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len); bool (*seqpacket_allow)(u32 remote_cid); u32 (*seqpacket_has_data)(struct vsock_sock *vsk); /* Notification. */ int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); int (*notify_recv_init)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_block)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, ssize_t, bool, struct vsock_transport_recv_notify_data *); int (*notify_send_init)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_block)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_enqueue)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, struct vsock_transport_send_notify_data *); /* sk_lock held by the caller */ void (*notify_buffer_size)(struct vsock_sock *, u64 *); int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); /* SIOCOUTQ ioctl */ ssize_t (*unsent_bytes)(struct vsock_sock *vsk); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); /* Addressing. */ u32 (*get_local_cid)(void); /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); /* Zero-copy. */ bool (*msgzerocopy_allow)(void); }; /**** CORE ****/ int vsock_core_register(const struct vsock_transport *t, int features); void vsock_core_unregister(const struct vsock_transport *t); /* The transport may downcast this to access transport-specific functions */ const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk); /**** UTILS ****/ /* vsock_table_lock must be held */ static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) { return !list_empty(&vsk->bound_table); } /* vsock_table_lock must be held */ static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) { return !list_empty(&vsk->connected_table); } void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected); void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); /**** TAP ****/ struct vsock_tap { struct net_device *dev; struct module *module; struct list_head list; }; int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); #ifdef CONFIG_BPF_SYSCALL extern struct proto vsock_proto; int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void __init vsock_bpf_build_proto(void); #else static inline void __init vsock_bpf_build_proto(void) {} #endif static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t) { return t->msgzerocopy_allow && t->msgzerocopy_allow(); } #endif /* __AF_VSOCK_H__ */
44 43 16 2 1 1 12 11 11 1 10 2 50 50 42 5 5 3 50 56 56 40 17 2 12 5 56 23 28 27 28 101 46 57 57 26 19 16 42 22 57 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 // SPDX-License-Identifier: GPL-2.0 /* * queue_stack_maps.c: BPF queue and stack maps * * Copyright (c) 2018 Politecnico di Torino */ #include <linux/bpf.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/btf_ids.h> #include "percpu_freelist.h" #include <asm/rqspinlock.h> #define QUEUE_STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; rqspinlock_t lock; u32 head, tail; u32 size; /* max_entries + 1 */ char elements[] __aligned(8); }; static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) { return container_of(map, struct bpf_queue_stack, map); } static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) { return qs->head == qs->tail; } static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) { u32 head = qs->head + 1; if (unlikely(head >= qs->size)) head = 0; return head == qs->tail; } /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ return -E2BIG; return 0; } static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; u64 size, queue_size; size = (u64) attr->max_entries + 1; queue_size = sizeof(*qs) + size * attr->value_size; qs = bpf_map_area_alloc(queue_size, numa_node); if (!qs) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&qs->map, attr); qs->size = size; raw_res_spin_lock_init(&qs->lock); return &qs->map; } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void queue_stack_map_free(struct bpf_map *map) { struct bpf_queue_stack *qs = bpf_queue_stack(map); bpf_map_area_free(qs); } static long __queue_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; int err = 0; void *ptr; if (raw_res_spin_lock_irqsave(&qs->lock, flags)) return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } ptr = &qs->elements[qs->tail * qs->map.value_size]; memcpy(value, ptr, qs->map.value_size); if (delete) { if (unlikely(++qs->tail >= qs->size)) qs->tail = 0; } out: raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } static long __stack_map_get(struct bpf_map *map, void *value, bool delete) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long flags; int err = 0; void *ptr; u32 index; if (raw_res_spin_lock_irqsave(&qs->lock, flags)) return -EBUSY; if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); err = -ENOENT; goto out; } index = qs->head - 1; if (unlikely(index >= qs->size)) index = qs->size - 1; ptr = &qs->elements[index * qs->map.value_size]; memcpy(value, ptr, qs->map.value_size); if (delete) qs->head = index; out: raw_res_spin_unlock_irqrestore(&qs->lock, flags); return err; } /* Called from syscall or from eBPF program */ static long queue_map_peek_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, false); } /* Called from syscall or from eBPF program */ static long stack_map_peek_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, false); } /* Called from syscall or from eBPF program */ static long queue_map_pop_elem(struct bpf_map *map, void *value) { return __queue_map_get(map, value, true); } /* Called from syscall or from eBPF program */ static long stack_map_pop_elem(struct bpf_map *map, void *value) { return __stack_map_get(map, value, true); } /* Called from syscall or from eBPF program */ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, u64 flags) { struct bpf_queue_stack *qs = bpf_queue_stack(map); unsigned long irq_flags; int err = 0; void *dst; /* BPF_EXIST is used to force making room for a new element in case the * map is full */ bool replace = (flags & BPF_EXIST); /* Check supported flags for queue and stack maps */ if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; if (raw_res_spin_lock_irqsave(&qs->lock, irq_flags)) return -EBUSY; if (queue_stack_map_is_full(qs)) { if (!replace) { err = -E2BIG; goto out; } /* advance tail pointer to overwrite oldest element */ if (unlikely(++qs->tail >= qs->size)) qs->tail = 0; } dst = &qs->elements[qs->head * qs->map.value_size]; memcpy(dst, value, qs->map.value_size); if (unlikely(++qs->head >= qs->size)) qs->head = 0; out: raw_res_spin_unlock_irqrestore(&qs->lock, irq_flags); return err; } /* Called from syscall or from eBPF program */ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) { return NULL; } /* Called from syscall or from eBPF program */ static long queue_stack_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return -EINVAL; } /* Called from syscall or from eBPF program */ static long queue_stack_map_delete_elem(struct bpf_map *map, void *key) { return -EINVAL; } /* Called from syscall */ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -EINVAL; } static u64 queue_stack_map_mem_usage(const struct bpf_map *map) { u64 usage = sizeof(struct bpf_queue_stack); usage += ((u64)map->max_entries + 1) * map->value_size; return usage; } BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack) const struct bpf_map_ops queue_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, .map_lookup_elem = queue_stack_map_lookup_elem, .map_update_elem = queue_stack_map_update_elem, .map_delete_elem = queue_stack_map_delete_elem, .map_push_elem = queue_stack_map_push_elem, .map_pop_elem = queue_map_pop_elem, .map_peek_elem = queue_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, .map_mem_usage = queue_stack_map_mem_usage, .map_btf_id = &queue_map_btf_ids[0], }; const struct bpf_map_ops stack_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = queue_stack_map_alloc_check, .map_alloc = queue_stack_map_alloc, .map_free = queue_stack_map_free, .map_lookup_elem = queue_stack_map_lookup_elem, .map_update_elem = queue_stack_map_update_elem, .map_delete_elem = queue_stack_map_delete_elem, .map_push_elem = queue_stack_map_push_elem, .map_pop_elem = stack_map_pop_elem, .map_peek_elem = stack_map_peek_elem, .map_get_next_key = queue_stack_map_get_next_key, .map_mem_usage = queue_stack_map_mem_usage, .map_btf_id = &queue_map_btf_ids[0], };
110 110 73 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright(c) 2004-2005 Intel Corporation. All rights reserved. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/string.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/in.h> #include <linux/sysfs.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/nsproxy.h> #include <net/bonding.h> #define to_bond(cd) ((struct bonding *)(netdev_priv(to_net_dev(cd)))) /* "show" function for the bond_masters attribute. * The class parameter is ignored. */ static ssize_t bonding_show_bonds(const struct class *cls, const struct class_attribute *attr, char *buf) { const struct bond_net *bn = container_of_const(attr, struct bond_net, class_attr_bonding_masters); struct bonding *bond; int res = 0; rcu_read_lock(); list_for_each_entry_rcu(bond, &bn->dev_list, bond_list) { if (res > (PAGE_SIZE - IFNAMSIZ)) { /* not enough space for another interface name */ if ((PAGE_SIZE - res) > 10) res = PAGE_SIZE - 10; res += sysfs_emit_at(buf, res, "++more++ "); break; } res += sysfs_emit_at(buf, res, "%s ", bond->dev->name); } if (res) buf[res-1] = '\n'; /* eat the leftover space */ rcu_read_unlock(); return res; } static struct net_device *bond_get_by_name(const struct bond_net *bn, const char *ifname) { struct bonding *bond; list_for_each_entry(bond, &bn->dev_list, bond_list) { if (strncmp(bond->dev->name, ifname, IFNAMSIZ) == 0) return bond->dev; } return NULL; } /* "store" function for the bond_masters attribute. This is what * creates and deletes entire bonds. * * The class parameter is ignored. */ static ssize_t bonding_store_bonds(const struct class *cls, const struct class_attribute *attr, const char *buffer, size_t count) { const struct bond_net *bn = container_of_const(attr, struct bond_net, class_attr_bonding_masters); char command[IFNAMSIZ + 1] = {0, }; char *ifname; int rv, res = count; sscanf(buffer, "%16s", command); /* IFNAMSIZ*/ ifname = command + 1; if ((strlen(command) <= 1) || !dev_valid_name(ifname)) goto err_no_cmd; if (command[0] == '+') { pr_info("%s is being created...\n", ifname); rv = bond_create(bn->net, ifname); if (rv) { if (rv == -EEXIST) pr_info("%s already exists\n", ifname); else pr_info("%s creation failed\n", ifname); res = rv; } } else if (command[0] == '-') { struct net_device *bond_dev; rtnl_lock(); bond_dev = bond_get_by_name(bn, ifname); if (bond_dev) { pr_info("%s is being deleted...\n", ifname); unregister_netdevice(bond_dev); } else { pr_err("unable to delete non-existent %s\n", ifname); res = -ENODEV; } rtnl_unlock(); } else goto err_no_cmd; /* Always return either count or an error. If you return 0, you'll * get called forever, which is bad. */ return res; err_no_cmd: pr_err("no command found in bonding_masters - use +ifname or -ifname\n"); return -EPERM; } /* class attribute for bond_masters file. This ends up in /sys/class/net */ static const struct class_attribute class_attr_bonding_masters = { .attr = { .name = "bonding_masters", .mode = 0644, }, .show = bonding_show_bonds, .store = bonding_store_bonds, }; /* Generic "store" method for bonding sysfs option setting */ static ssize_t bonding_sysfs_store_option(struct device *d, struct device_attribute *attr, const char *buffer, size_t count) { struct bonding *bond = to_bond(d); const struct bond_option *opt; char *buffer_clone; int ret; opt = bond_opt_get_by_name(attr->attr.name); if (WARN_ON(!opt)) return -ENOENT; buffer_clone = kstrndup(buffer, count, GFP_KERNEL); if (!buffer_clone) return -ENOMEM; ret = bond_opt_tryset_rtnl(bond, opt->id, buffer_clone); if (!ret) ret = count; kfree(buffer_clone); return ret; } /* Show the slaves in the current bond. */ static ssize_t bonding_show_slaves(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); struct list_head *iter; struct slave *slave; int res = 0; rcu_read_lock(); bond_for_each_slave_rcu(bond, slave, iter) { if (res > (PAGE_SIZE - IFNAMSIZ)) { /* not enough space for another interface name */ if ((PAGE_SIZE - res) > 10) res = PAGE_SIZE - 10; res += sysfs_emit_at(buf, res, "++more++ "); break; } res += sysfs_emit_at(buf, res, "%s ", slave->dev->name); } rcu_read_unlock(); if (res) buf[res-1] = '\n'; /* eat the leftover space */ return res; } static DEVICE_ATTR(slaves, 0644, bonding_show_slaves, bonding_sysfs_store_option); /* Show the bonding mode. */ static ssize_t bonding_show_mode(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_MODE, BOND_MODE(bond)); return sysfs_emit(buf, "%s %d\n", val->string, BOND_MODE(bond)); } static DEVICE_ATTR(mode, 0644, bonding_show_mode, bonding_sysfs_store_option); /* Show the bonding transmit hash method. */ static ssize_t bonding_show_xmit_hash(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_XMIT_HASH, bond->params.xmit_policy); return sysfs_emit(buf, "%s %d\n", val->string, bond->params.xmit_policy); } static DEVICE_ATTR(xmit_hash_policy, 0644, bonding_show_xmit_hash, bonding_sysfs_store_option); /* Show arp_validate. */ static ssize_t bonding_show_arp_validate(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_ARP_VALIDATE, bond->params.arp_validate); return sysfs_emit(buf, "%s %d\n", val->string, bond->params.arp_validate); } static DEVICE_ATTR(arp_validate, 0644, bonding_show_arp_validate, bonding_sysfs_store_option); /* Show arp_all_targets. */ static ssize_t bonding_show_arp_all_targets(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_ARP_ALL_TARGETS, bond->params.arp_all_targets); return sysfs_emit(buf, "%s %d\n", val->string, bond->params.arp_all_targets); } static DEVICE_ATTR(arp_all_targets, 0644, bonding_show_arp_all_targets, bonding_sysfs_store_option); /* Show fail_over_mac. */ static ssize_t bonding_show_fail_over_mac(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_FAIL_OVER_MAC, bond->params.fail_over_mac); return sysfs_emit(buf, "%s %d\n", val->string, bond->params.fail_over_mac); } static DEVICE_ATTR(fail_over_mac, 0644, bonding_show_fail_over_mac, bonding_sysfs_store_option); /* Show the arp timer interval. */ static ssize_t bonding_show_arp_interval(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.arp_interval); } static DEVICE_ATTR(arp_interval, 0644, bonding_show_arp_interval, bonding_sysfs_store_option); /* Show the arp targets. */ static ssize_t bonding_show_arp_targets(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); int i, res = 0; for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) { if (bond->params.arp_targets[i]) res += sysfs_emit_at(buf, res, "%pI4 ", &bond->params.arp_targets[i]); } if (res) buf[res-1] = '\n'; /* eat the leftover space */ return res; } static DEVICE_ATTR(arp_ip_target, 0644, bonding_show_arp_targets, bonding_sysfs_store_option); /* Show the arp missed max. */ static ssize_t bonding_show_missed_max(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%u\n", bond->params.missed_max); } static DEVICE_ATTR(arp_missed_max, 0644, bonding_show_missed_max, bonding_sysfs_store_option); /* Show the up and down delays. */ static ssize_t bonding_show_downdelay(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.downdelay * bond->params.miimon); } static DEVICE_ATTR(downdelay, 0644, bonding_show_downdelay, bonding_sysfs_store_option); static ssize_t bonding_show_updelay(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.updelay * bond->params.miimon); } static DEVICE_ATTR(updelay, 0644, bonding_show_updelay, bonding_sysfs_store_option); static ssize_t bonding_show_peer_notif_delay(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.peer_notif_delay * bond->params.miimon); } static DEVICE_ATTR(peer_notif_delay, 0644, bonding_show_peer_notif_delay, bonding_sysfs_store_option); /* Show the LACP activity and interval. */ static ssize_t bonding_show_lacp_active(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_LACP_ACTIVE, bond->params.lacp_active); return sysfs_emit(buf, "%s %d\n", val->string, bond->params.lacp_active); } static DEVICE_ATTR(lacp_active, 0644, bonding_show_lacp_active, bonding_sysfs_store_option); static ssize_t bonding_show_lacp_rate(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_LACP_RATE, bond->params.lacp_fast); return sysfs_emit(buf, "%s %d\n", val->string, bond->params.lacp_fast); } static DEVICE_ATTR(lacp_rate, 0644, bonding_show_lacp_rate, bonding_sysfs_store_option); static ssize_t bonding_show_min_links(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%u\n", bond->params.min_links); } static DEVICE_ATTR(min_links, 0644, bonding_show_min_links, bonding_sysfs_store_option); static ssize_t bonding_show_ad_select(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_AD_SELECT, bond->params.ad_select); return sysfs_emit(buf, "%s %d\n", val->string, bond->params.ad_select); } static DEVICE_ATTR(ad_select, 0644, bonding_show_ad_select, bonding_sysfs_store_option); /* Show the number of peer notifications to send after a failover event. */ static ssize_t bonding_show_num_peer_notif(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.num_peer_notif); } static DEVICE_ATTR(num_grat_arp, 0644, bonding_show_num_peer_notif, bonding_sysfs_store_option); static DEVICE_ATTR(num_unsol_na, 0644, bonding_show_num_peer_notif, bonding_sysfs_store_option); /* Show the MII monitor interval. */ static ssize_t bonding_show_miimon(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.miimon); } static DEVICE_ATTR(miimon, 0644, bonding_show_miimon, bonding_sysfs_store_option); /* Show the primary slave. */ static ssize_t bonding_show_primary(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); struct slave *primary; int count = 0; rcu_read_lock(); primary = rcu_dereference(bond->primary_slave); if (primary) count = sysfs_emit(buf, "%s\n", primary->dev->name); rcu_read_unlock(); return count; } static DEVICE_ATTR(primary, 0644, bonding_show_primary, bonding_sysfs_store_option); /* Show the primary_reselect flag. */ static ssize_t bonding_show_primary_reselect(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); const struct bond_opt_value *val; val = bond_opt_get_val(BOND_OPT_PRIMARY_RESELECT, bond->params.primary_reselect); return sysfs_emit(buf, "%s %d\n", val->string, bond->params.primary_reselect); } static DEVICE_ATTR(primary_reselect, 0644, bonding_show_primary_reselect, bonding_sysfs_store_option); /* Show the use_carrier flag. */ static ssize_t bonding_show_carrier(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.use_carrier); } static DEVICE_ATTR(use_carrier, 0644, bonding_show_carrier, bonding_sysfs_store_option); /* Show currently active_slave. */ static ssize_t bonding_show_active_slave(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); struct net_device *slave_dev; int count = 0; rcu_read_lock(); slave_dev = bond_option_active_slave_get_rcu(bond); if (slave_dev) count = sysfs_emit(buf, "%s\n", slave_dev->name); rcu_read_unlock(); return count; } static DEVICE_ATTR(active_slave, 0644, bonding_show_active_slave, bonding_sysfs_store_option); /* Show link status of the bond interface. */ static ssize_t bonding_show_mii_status(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); bool active = netif_carrier_ok(bond->dev); return sysfs_emit(buf, "%s\n", active ? "up" : "down"); } static DEVICE_ATTR(mii_status, 0444, bonding_show_mii_status, NULL); /* Show current 802.3ad aggregator ID. */ static ssize_t bonding_show_ad_aggregator(struct device *d, struct device_attribute *attr, char *buf) { int count = 0; struct bonding *bond = to_bond(d); if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; count = sysfs_emit(buf, "%d\n", bond_3ad_get_active_agg_info(bond, &ad_info) ? 0 : ad_info.aggregator_id); } return count; } static DEVICE_ATTR(ad_aggregator, 0444, bonding_show_ad_aggregator, NULL); /* Show number of active 802.3ad ports. */ static ssize_t bonding_show_ad_num_ports(struct device *d, struct device_attribute *attr, char *buf) { int count = 0; struct bonding *bond = to_bond(d); if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; count = sysfs_emit(buf, "%d\n", bond_3ad_get_active_agg_info(bond, &ad_info) ? 0 : ad_info.ports); } return count; } static DEVICE_ATTR(ad_num_ports, 0444, bonding_show_ad_num_ports, NULL); /* Show current 802.3ad actor key. */ static ssize_t bonding_show_ad_actor_key(struct device *d, struct device_attribute *attr, char *buf) { int count = 0; struct bonding *bond = to_bond(d); if (BOND_MODE(bond) == BOND_MODE_8023AD && capable(CAP_NET_ADMIN)) { struct ad_info ad_info; count = sysfs_emit(buf, "%d\n", bond_3ad_get_active_agg_info(bond, &ad_info) ? 0 : ad_info.actor_key); } return count; } static DEVICE_ATTR(ad_actor_key, 0444, bonding_show_ad_actor_key, NULL); /* Show current 802.3ad partner key. */ static ssize_t bonding_show_ad_partner_key(struct device *d, struct device_attribute *attr, char *buf) { int count = 0; struct bonding *bond = to_bond(d); if (BOND_MODE(bond) == BOND_MODE_8023AD && capable(CAP_NET_ADMIN)) { struct ad_info ad_info; count = sysfs_emit(buf, "%d\n", bond_3ad_get_active_agg_info(bond, &ad_info) ? 0 : ad_info.partner_key); } return count; } static DEVICE_ATTR(ad_partner_key, 0444, bonding_show_ad_partner_key, NULL); /* Show current 802.3ad partner mac. */ static ssize_t bonding_show_ad_partner_mac(struct device *d, struct device_attribute *attr, char *buf) { int count = 0; struct bonding *bond = to_bond(d); if (BOND_MODE(bond) == BOND_MODE_8023AD && capable(CAP_NET_ADMIN)) { struct ad_info ad_info; if (!bond_3ad_get_active_agg_info(bond, &ad_info)) count = sysfs_emit(buf, "%pM\n", ad_info.partner_system); } return count; } static DEVICE_ATTR(ad_partner_mac, 0444, bonding_show_ad_partner_mac, NULL); /* Show the queue_ids of the slaves in the current bond. */ static ssize_t bonding_show_queue_id(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); struct list_head *iter; struct slave *slave; int res = 0; rcu_read_lock(); bond_for_each_slave_rcu(bond, slave, iter) { if (res > (PAGE_SIZE - IFNAMSIZ - 6)) { /* not enough space for another interface_name:queue_id pair */ if ((PAGE_SIZE - res) > 10) res = PAGE_SIZE - 10; res += sysfs_emit_at(buf, res, "++more++ "); break; } res += sysfs_emit_at(buf, res, "%s:%d ", slave->dev->name, READ_ONCE(slave->queue_id)); } if (res) buf[res-1] = '\n'; /* eat the leftover space */ rcu_read_unlock(); return res; } static DEVICE_ATTR(queue_id, 0644, bonding_show_queue_id, bonding_sysfs_store_option); /* Show the all_slaves_active flag. */ static ssize_t bonding_show_slaves_active(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.all_slaves_active); } static DEVICE_ATTR(all_slaves_active, 0644, bonding_show_slaves_active, bonding_sysfs_store_option); /* Show the number of IGMP membership reports to send on link failure */ static ssize_t bonding_show_resend_igmp(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.resend_igmp); } static DEVICE_ATTR(resend_igmp, 0644, bonding_show_resend_igmp, bonding_sysfs_store_option); static ssize_t bonding_show_lp_interval(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.lp_interval); } static DEVICE_ATTR(lp_interval, 0644, bonding_show_lp_interval, bonding_sysfs_store_option); static ssize_t bonding_show_tlb_dynamic_lb(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); return sysfs_emit(buf, "%d\n", bond->params.tlb_dynamic_lb); } static DEVICE_ATTR(tlb_dynamic_lb, 0644, bonding_show_tlb_dynamic_lb, bonding_sysfs_store_option); static ssize_t bonding_show_packets_per_slave(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); unsigned int packets_per_slave = bond->params.packets_per_slave; return sysfs_emit(buf, "%u\n", packets_per_slave); } static DEVICE_ATTR(packets_per_slave, 0644, bonding_show_packets_per_slave, bonding_sysfs_store_option); static ssize_t bonding_show_ad_actor_sys_prio(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); if (BOND_MODE(bond) == BOND_MODE_8023AD && capable(CAP_NET_ADMIN)) return sysfs_emit(buf, "%hu\n", bond->params.ad_actor_sys_prio); return 0; } static DEVICE_ATTR(ad_actor_sys_prio, 0644, bonding_show_ad_actor_sys_prio, bonding_sysfs_store_option); static ssize_t bonding_show_ad_actor_system(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); if (BOND_MODE(bond) == BOND_MODE_8023AD && capable(CAP_NET_ADMIN)) return sysfs_emit(buf, "%pM\n", bond->params.ad_actor_system); return 0; } static DEVICE_ATTR(ad_actor_system, 0644, bonding_show_ad_actor_system, bonding_sysfs_store_option); static ssize_t bonding_show_ad_user_port_key(struct device *d, struct device_attribute *attr, char *buf) { struct bonding *bond = to_bond(d); if (BOND_MODE(bond) == BOND_MODE_8023AD && capable(CAP_NET_ADMIN)) return sysfs_emit(buf, "%hu\n", bond->params.ad_user_port_key); return 0; } static DEVICE_ATTR(ad_user_port_key, 0644, bonding_show_ad_user_port_key, bonding_sysfs_store_option); static struct attribute *per_bond_attrs[] = { &dev_attr_slaves.attr, &dev_attr_mode.attr, &dev_attr_fail_over_mac.attr, &dev_attr_arp_validate.attr, &dev_attr_arp_all_targets.attr, &dev_attr_arp_interval.attr, &dev_attr_arp_ip_target.attr, &dev_attr_downdelay.attr, &dev_attr_updelay.attr, &dev_attr_peer_notif_delay.attr, &dev_attr_lacp_active.attr, &dev_attr_lacp_rate.attr, &dev_attr_ad_select.attr, &dev_attr_xmit_hash_policy.attr, &dev_attr_num_grat_arp.attr, &dev_attr_num_unsol_na.attr, &dev_attr_miimon.attr, &dev_attr_primary.attr, &dev_attr_primary_reselect.attr, &dev_attr_use_carrier.attr, &dev_attr_active_slave.attr, &dev_attr_mii_status.attr, &dev_attr_ad_aggregator.attr, &dev_attr_ad_num_ports.attr, &dev_attr_ad_actor_key.attr, &dev_attr_ad_partner_key.attr, &dev_attr_ad_partner_mac.attr, &dev_attr_queue_id.attr, &dev_attr_all_slaves_active.attr, &dev_attr_resend_igmp.attr, &dev_attr_min_links.attr, &dev_attr_lp_interval.attr, &dev_attr_packets_per_slave.attr, &dev_attr_tlb_dynamic_lb.attr, &dev_attr_ad_actor_sys_prio.attr, &dev_attr_ad_actor_system.attr, &dev_attr_ad_user_port_key.attr, &dev_attr_arp_missed_max.attr, NULL, }; static const struct attribute_group bonding_group = { .name = "bonding", .attrs = per_bond_attrs, }; /* Initialize sysfs. This sets up the bonding_masters file in * /sys/class/net. */ int __net_init bond_create_sysfs(struct bond_net *bn) { int ret; bn->class_attr_bonding_masters = class_attr_bonding_masters; sysfs_attr_init(&bn->class_attr_bonding_masters.attr); ret = netdev_class_create_file_ns(&bn->class_attr_bonding_masters, bn->net); /* Permit multiple loads of the module by ignoring failures to * create the bonding_masters sysfs file. Bonding devices * created by second or subsequent loads of the module will * not be listed in, or controllable by, bonding_masters, but * will have the usual "bonding" sysfs directory. * * This is done to preserve backwards compatibility for * initscripts/sysconfig, which load bonding multiple times to * configure multiple bonding devices. */ if (ret == -EEXIST) { /* Is someone being kinky and naming a device bonding_master? */ if (netdev_name_in_use(bn->net, class_attr_bonding_masters.attr.name)) pr_err("network device named %s already exists in sysfs\n", class_attr_bonding_masters.attr.name); ret = 0; } return ret; } /* Remove /sys/class/net/bonding_masters. */ void __net_exit bond_destroy_sysfs(struct bond_net *bn) { netdev_class_remove_file_ns(&bn->class_attr_bonding_masters, bn->net); } /* Initialize sysfs for each bond. This sets up and registers * the 'bondctl' directory for each individual bond under /sys/class/net. */ void bond_prepare_sysfs_group(struct bonding *bond) { bond->dev->sysfs_groups[0] = &bonding_group; }
10 10 1 10 10 10 10 10 10 105 81 81 71 10 81 17 14 3 17 17 1 17 9 74 14 73 4 31 3 33 13 92 92 92 6 2 17 41 36 9 9 9 60 48 40 25 1 30 39 9 21 10 3 19 7 7 1 7 7 7 2 2 44 44 203 239 22 68 27 16 16 48 48 17 17 31 48 48 46 40 6 4 2 44 44 4 47 3 3 2 2 2 2 1 2 3 76 6 5 9 59 5 2 3 3 3 5 58 57 46 41 41 41 11 50 3 1 3 3 3 3 3 3 1 3 3 1 2 1 3 3 3 3 56 3 56 1 52 2 1 1 54 54 55 61 61 4 55 1 59 179 8 172 18 163 179 157 19 13 10 7 3 2 1 43 60 51 1 1 1 1 49 37 9 2 3 7 43 18 3 1 1 1 32 8 3 30 33 3 410 28 28 8 181 14 158 171 171 171 171 161 10 55 43 11 51 1 42 8 24 26 22 29 46 4 48 2 48 2 46 4 40 10 40 11 47 2 51 51 46 5 10 39 33 8 4 1 2 1 2 7 1 6 258 293 5 6 4 3 2 3 1 8 3 1 4 283 37 405 98 98 98 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 tunneling device * Linux INET6 implementation * * Authors: * Ville Nuorvala <vnuorval@tcs.hut.fi> * Yasuyuki Kozakai <kozakai@linux-ipv6.org> * * Based on: * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c * * RFC 2473 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/etherdevice.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netdev_lock.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> MODULE_AUTHOR("Ville Nuorvala"); MODULE_DESCRIPTION("IPv6 tunneling device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ip6tnl"); MODULE_ALIAS_NETDEV("ip6tnl0"); #define IP6_TUNNEL_HASH_SIZE_SHIFT 5 #define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); } static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static struct rtnl_link_ops ip6_link_ops __read_mostly; static unsigned int ip6_tnl_net_id __read_mostly; struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; struct ip6_tnl __rcu *collect_md_tun; }; static inline int ip6_tnl_mpls_supported(void) { return IS_ENABLED(CONFIG_MPLS); } #define for_each_ip6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * ip6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @link: ifindex of underlying interface * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * ip6_tnl_lookup(struct net *net, int link, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t, *cand = NULL; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct in6_addr any; for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_equal(remote, &t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else cand = t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(local, &t->parms.laddr) || !ipv6_addr_any(&t->parms.raddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } hash = HASH(remote, &any); for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (!ipv6_addr_equal(remote, &t->parms.raddr) || !ipv6_addr_any(&t->parms.laddr) || !(t->dev->flags & IFF_UP)) continue; if (link == t->parms.link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(ip6n->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * ip6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * ip6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } /** * ip6_tnl_link - add tunnel to hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be added **/ static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, t); rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } /** * ip6_tnl_unlink - remove tunnel from hash table * @ip6n: the private data for ip6_vti in the netns * @t: tunnel to be removed **/ static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; if (t->parms.collect_md) rcu_assign_pointer(ip6n->collect_md_tun, NULL); for (tp = ip6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); } static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); ip6_tnl_link(ip6n, t); return 0; out: return err; } /** * ip6_tnl_create - create a new tunnel * @net: network namespace * @p: tunnel parameters * * Description: * Create tunnel matching given parameters. * * Return: * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err = -E2BIG; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6tnl%%d"); } err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = ip6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } /** * ip6_tnl_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * ip6_tnl_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); for (tp = ip6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && p->link == t->parms.link) { if (create) return ERR_PTR(-EEXIST); return t; } } if (!create) return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } /** * ip6_tnl_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * ip6_tnl_dev_uninit() removes tunnel from its list **/ static void ip6_tnl_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else ip6_tnl_unlink(ip6n, t); dst_cache_reset(&t->dst_cache); netdev_put(dev, &t->dev_tracker); } /** * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option * @skb: received socket buffer * @raw: the ICMPv6 error message data * * Return: * 0 if none was found, * else index to encapsulation limit **/ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; unsigned int nhoff = raw - skb->data; unsigned int off = nhoff + sizeof(*ipv6h); u8 nexthdr = ipv6h->nexthdr; while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { struct ipv6_opt_hdr *hdr; u16 optlen; if (!pskb_may_pull(skb, off + sizeof(*hdr))) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { optlen = 8; } else if (nexthdr == NEXTHDR_AUTH) { optlen = ipv6_authlen(hdr); } else { optlen = ipv6_optlen(hdr); } if (!pskb_may_pull(skb, off + optlen)) break; hdr = (struct ipv6_opt_hdr *)(skb->data + off); if (nexthdr == NEXTHDR_FRAGMENT) { struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr; if (frag_hdr->frag_off) break; } if (nexthdr == NEXTHDR_DEST) { u16 i = 2; while (1) { struct ipv6_tlv_tnl_enc_lim *tel; /* No more room for encapsulation limit */ if (i + sizeof(*tel) > optlen) break; tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); /* return index of option if found and valid */ if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && tel->length == 1) return i + off - nhoff; /* else jump to next option */ if (tel->type) i += tel->length + 2; else i++; } } nexthdr = hdr->nexthdr; off += optlen; } return 0; } EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim); /* ip6_tnl_err() should handle errors in the tunnel according to the * specifications in RFC 2473. */ static int ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt, u8 *type, u8 *code, int *msg, __u32 *info, int offset) { const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); u8 rel_type = ICMPV6_DEST_UNREACH; u8 rel_code = ICMPV6_ADDR_UNREACH; __u32 rel_info = 0; struct ip6_tnl *t; int err = -ENOENT; int rel_msg = 0; u8 tproto; __u16 len; /* If the packet doesn't contain the original IPv6 header we are in trouble since we might need the source address for further processing of the error. */ rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr); if (!t) goto out; tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto out; err = 0; switch (*type) { case ICMPV6_DEST_UNREACH: net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", t->parms.name); rel_msg = 1; break; case ICMPV6_TIME_EXCEED: if ((*code) == ICMPV6_EXC_HOPLIMIT) { net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } break; case ICMPV6_PARAMPROB: { struct ipv6_tlv_tnl_enc_lim *tel; __u32 teli; teli = 0; if ((*code) == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); if (teli && teli == *info - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", t->parms.name); rel_msg = 1; } } else { net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", t->parms.name); } break; } case ICMPV6_PKT_TOOBIG: { __u32 mtu; ip6_update_pmtu(skb, net, htonl(*info), 0, 0, sock_net_uid(net, NULL)); mtu = *info - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; len = sizeof(*ipv6h) + ntohs(ipv6h->payload_len); if (len > mtu) { rel_type = ICMPV6_PKT_TOOBIG; rel_code = 0; rel_info = mtu; rel_msg = 1; } break; } case NDISC_REDIRECT: ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); break; } *type = rel_type; *code = rel_code; *info = rel_info; *msg = rel_msg; out: rcu_read_unlock(); return err; } static int ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); const struct iphdr *eiph; struct sk_buff *skb2; int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; struct rtable *rt; struct flowi4 fl4; err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg == 0) return 0; switch (rel_type) { case ICMPV6_DEST_UNREACH: if (rel_code != ICMPV6_ADDR_UNREACH) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_HOST_UNREACH; break; case ICMPV6_PKT_TOOBIG: if (rel_code != 0) return 0; rel_type = ICMP_DEST_UNREACH; rel_code = ICMP_FRAG_NEEDED; break; default: return 0; } if (!pskb_may_pull(skb, offset + sizeof(struct iphdr))) return 0; skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); eiph = ip_hdr(skb2); /* Try to guess incoming interface */ rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr, 0, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt)) goto out; skb2->dev = rt->dst.dev; ip_rt_put(rt); /* route "incoming" packet */ if (rt->rt_flags & RTCF_LOCAL) { rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->daddr, eiph->saddr, 0, 0, IPPROTO_IPIP, eiph->tos & INET_DSCP_MASK, 0); if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) { if (!IS_ERR(rt)) ip_rt_put(rt); goto out; } skb_dst_set(skb2, &rt->dst); } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, ip4h_dscp(eiph), skb2->dev) || skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) goto out; } /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { if (rel_info > dst_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); } icmp_send(skb2, rel_type, rel_code, htonl(rel_info)); out: kfree_skb(skb2); return 0; } static int ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); if (err < 0) return err; if (rel_msg && pskb_may_pull(skb, offset + sizeof(struct ipv6hdr))) { struct rt6_info *rt; struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) return 0; skb_dst_drop(skb2); skb_pull(skb2, offset); skb_reset_network_header(skb2); /* Try to guess incoming interface */ rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb2, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; icmpv6_send(skb2, rel_type, rel_code, rel_info); ip6_rt_put(rt); kfree_skb(skb2); } return 0; } static int mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __u32 rel_info = ntohl(info); int err, rel_msg = 0; u8 rel_type = type; u8 rel_code = code; err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, &rel_msg, &rel_info, offset); return err; } static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); return IP6_ECN_decapsulate(ipv6h, skb); } static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); return IP6_ECN_decapsulate(ipv6h, skb); } static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) { /* ECN is not supported in AF_MPLS */ return 0; } __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ltype = ipv6_addr_type(laddr); int rtype = ipv6_addr_type(raddr); __u32 flags = 0; if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { flags = IP6_TNL_F_CAP_PER_PACKET; } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { if (ltype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_XMIT; if (rtype&IPV6_ADDR_UNICAST) flags |= IP6_TNL_F_CAP_RCV; } return flags; } EXPORT_SYMBOL(ip6_tnl_get_cap); /* called with rcu_read_lock() */ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if ((p->flags & IP6_TNL_F_CAP_RCV) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { struct net_device *ldev = NULL; if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if ((ipv6_addr_is_multicast(laddr) || likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) && ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE)))) ret = 1; } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb), bool log_ecn_err) { const struct ipv6hdr *ipv6h; int nh, err; if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) != test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) { if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb->protocol = tpi->proto; /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; skb_reset_mac_header(skb); } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(tunnel->dev, rx_length_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } /* Get the outer header. */ ipv6h = (struct ipv6hdr *)(skb->head + nh); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); if (unlikely(err)) { if (log_ecn_err) net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_err) { int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb); dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate; if (tpi->proto == htons(ETH_P_IP)) dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate; return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_err); } EXPORT_SYMBOL(ip6_tnl_rcv); static const struct tnl_ptk_info tpi_v6 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IPV6), }; static const struct tnl_ptk_info tpi_v4 = { /* no tunnel info required for ipxip6. */ .proto = htons(ETH_P_IP), }; static const struct tnl_ptk_info tpi_mpls = { /* no tunnel info required for mplsip6. */ .proto = htons(ETH_P_MPLS_UC), }; static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb)) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); struct metadata_dst *tun_dst = NULL; int ret = -1; rcu_read_lock(); t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr); if (t) { u8 tproto = READ_ONCE(t->parms.proto); if (tproto != ipproto && tproto != 0) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) goto drop; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; if (t->parms.collect_md) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0); if (!tun_dst) goto drop; } ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, log_ecn_error); } rcu_read_unlock(); return ret; drop: rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, ip6ip6_dscp_ecn_decapsulate); } static int mplsip6_rcv(struct sk_buff *skb) { return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, mplsip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; }; static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) { memset(opt, 0, sizeof(struct ipv6_tel_txoption)); opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; opt->dst_opt[3] = 1; opt->dst_opt[4] = encap_limit; opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } /** * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) { struct __ip6_tnl_parm *p = &t->parms; int ret = 0; struct net *net = t->net; if (t->parms.collect_md) return 1; if ((p->flags & IP6_TNL_F_CAP_XMIT) || ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { struct net_device *ldev = NULL; rcu_read_lock(); if (p->link) ldev = dev_get_by_index_rcu(net, p->link); if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", p->name); else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && !ipv6_addr_is_multicast(raddr) && unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, true, 0, IFA_F_TENTATIVE))) pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", p->name); else ret = 1; rcu_read_unlock(); } return ret; } EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending * it. * * Return: * 0 on success * -1 fail * %-EMSGSIZE message too big. return mtu in this case. **/ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct flowi6 *fl6, int encap_limit, __u32 *pmtu, __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; unsigned int max_headroom = psh_hlen; __be16 payload_protocol; bool use_cache = false; u8 hop_limit; int err = -1; payload_protocol = skb_protocol(skb, true); if (t->parms.collect_md) { hop_limit = skb_tunnel_info(skb)->key.ttl; goto route_lookup; } else { hop_limit = t->parms.hop_limit; } /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { if (payload_protocol == htons(ETH_P_IPV6)) { struct in6_addr *addr6; struct neighbour *neigh; int addr_type; if (!skb_dst(skb)) goto tx_err_link_failure; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_err_link_failure; addr6 = (struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) addr6 = &ipv6_hdr(skb)->daddr; memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); neigh_release(neigh); } else if (payload_protocol == htons(ETH_P_IP)) { const struct rtable *rt = skb_rtable(skb); if (!rt) goto tx_err_link_failure; if (rt->rt_gw_family == AF_INET6) memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr)); } } else if (t->parms.proto != 0 && !(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only if neither the outer protocol nor the * routing decision depends on the current inner header value */ use_cache = true; } if (use_cache) dst = dst_cache_get(&t->dst_cache); if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) goto tx_err_link_failure; if (!dst) { route_lookup: /* add dsfield to flowlabel for route lookup */ fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); dst = ip6_route_output(net, NULL, fl6); if (dst->error) goto tx_err_link_failure; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (t->parms.collect_md && ipv6_addr_any(&fl6->saddr) && ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, &fl6->daddr, 0, &fl6->saddr)) goto tx_err_link_failure; ndst = dst; } tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; } mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ? IPV6_MIN_MTU : IPV4_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } if (t->err_count > 0) { if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) { t->err_count--; dst_link_failure(skb); } else { t->err_count = 0; } } skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* * Okay, now see if we can stuff it in the buffer as-is. */ max_headroom += LL_RESERVED_SPACE(tdev); if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb; new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) goto tx_err_dst_release; if (skb->sk) skb_set_owner_w(new_skb, skb->sk); consume_skb(skb); skb = new_skb; } if (t->parms.collect_md) { if (t->encap.type != TUNNEL_ENCAP_NONE) goto tx_err_dst_release; } else { if (use_cache && ndst) dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); } skb_dst_set(skb, dst); if (hop_limit == 0) { if (payload_protocol == htons(ETH_P_IP)) hop_limit = ip_hdr(skb)->ttl; else if (payload_protocol == htons(ETH_P_IPV6)) hop_limit = ipv6_hdr(skb)->hop_limit; else hop_limit = ip6_dst_hoplimit(dst); } /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) return err; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_frag_opts(skb, &opt.ops, &proto); } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, dsfield, ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; ip6tunnel_xmit(NULL, skb, dev); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; __u16 offset; struct flowi6 fl6; __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; tproto = READ_ONCE(t->parms.proto); if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET6)) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); break; default: orig_dsfield = dsfield; break; } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; if (protocol == IPPROTO_IPV6) { offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); /* ip6_tnl_parse_tlv_enc_lim() might have * reallocated skb->head */ if (offset > 0) { struct ipv6_tlv_tnl_enc_lim *tel; tel = (void *)&skb_network_header(skb)[offset]; if (tel->encap_limit == 0) { icmpv6_ndo_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD, offset + 2); return -1; } encap_limit = tel->encap_limit - 1; } } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = protocol; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; switch (protocol) { case IPPROTO_IPIP: iph = ip_hdr(skb); orig_dsfield = ipv4_get_dsfield(iph); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); break; case IPPROTO_IPV6: ipv6h = ipv6_hdr(skb); orig_dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) dsfield = orig_dsfield; else dsfield = ip6_tclass(t->parms.flowinfo); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) fl6.flowlabel |= ip6_flowlabel(ipv6h); break; default: orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); break; } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; skb_set_inner_ipproto(skb, protocol); err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) switch (protocol) { case IPPROTO_IPIP: icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); break; case IPPROTO_IPV6: icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); break; default: break; } return -1; } return 0; } static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; ipproto = IPPROTO_IPV6; break; case htons(ETH_P_MPLS_UC): ipproto = IPPROTO_MPLS; break; default: goto tx_err; } ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static void ip6_tnl_link_config(struct ip6_tnl *t) { struct net_device *dev = t->dev; struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; int t_hlen; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ fl6->saddr = p->laddr; fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) { tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) { dev->needed_headroom = tdev->hard_header_len + tdev->needed_headroom + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) mtu -= 8; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); } } } /** * ip6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * * Description: * ip6_tnl_change() updates the tunnel parameters **/ static void ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.flags = p->flags; t->parms.hop_limit = p->hop_limit; t->parms.encap_limit = p->encap_limit; t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); } static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); ip6_tnl_unlink(ip6n, t); synchronize_net(); ip6_tnl_change(t, p); ip6_tnl_link(ip6n, t); netdev_state_change(t->dev); } static void ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) { /* for default tnl0 device allow to change only the proto */ t->parms.proto = p->proto; netdev_state_change(t->dev); } static void ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->flags = u->flags; p->hop_limit = u->hop_limit; p->encap_limit = u->encap_limit; p->flowinfo = u->flowinfo; p->link = u->link; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->flags = p->flags; u->hop_limit = p->hop_limit; u->encap_limit = p->encap_limit; u->flowinfo = p->flowinfo; u->link = p->link; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * ip6_tnl_ioctl() is used for managing IPv6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6tnl0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && p.proto != 0) break; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); if (dev == ip6n->fb_tnl_dev) ip6_tnl0_update(t, &p1); else ip6_tnl_update(t, &p1); } if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { err = PTR_ERR(t); } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } /** * ip6_tnl_change_mtu - change mtu manually for tunnel device * @dev: virtual device associated with tunnel * @new_mtu: the new mtu * * Return: * 0 on success, * %-EINVAL if mtu too small **/ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); int t_hlen; t_hlen = tnl->hlen + sizeof(struct ipv6hdr); if (tnl->parms.proto == IPPROTO_IPV6) { if (new_mtu < IPV6_MIN_MTU) return -EINVAL; } else { if (new_mtu < ETH_MIN_MTU) return -EINVAL; } if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) { if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } else { if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); return READ_ONCE(t->parms.link); } EXPORT_SYMBOL(ip6_tnl_get_iflink); int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip6_tnl_encap_add_ops); int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip6_tnl_encap_ops **) &ip6tun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); int ip6_tnl_encap_setup(struct ip6_tnl *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip6_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); dev->features |= IPXIPX_FEATURES; dev->hw_features |= IPXIPX_FEATURES; /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * ip6_tnl_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; int t_hlen; t->dev = dev; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) return ret; ret = gro_cells_init(&t->gro_cells, dev); if (ret) goto destroy_dst; t->tun_hlen = 0; t->hlen = t->encap_hlen + t->tun_hlen; t_hlen = t->hlen + sizeof(struct ipv6hdr); dev->type = ARPHRD_TUNNEL6; dev->mtu = ETH_DATA_LEN - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; destroy_dst: dst_cache_destroy(&t->dst_cache); return ret; } /** * ip6_tnl_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = ip6_tnl_dev_init_gen(dev); if (err) return err; ip6_tnl_link_config(t); if (t->parms.collect_md) netif_keep_dst(dev); return 0; } /** * ip6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (proto != IPPROTO_IPV6 && proto != IPPROTO_IPIP && proto != 0) return -EINVAL; return 0; } static void ip6_tnl_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); if (data[IFLA_IPTUN_ENCAP_LIMIT]) parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); if (data[IFLA_IPTUN_FLOWINFO]) parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); if (data[IFLA_IPTUN_FLAGS]) parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; if (data[IFLA_IPTUN_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static int ip6_tnl_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; struct net *net; int err; net = params->link_net ? : dev_net(dev); ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &nt->parms); if (nt->parms.collect_md) { if (rtnl_dereference(ip6n->collect_md_tun)) return -EEXIST; } else { t = ip6_tnl_locate(net, &nt->parms, 0); if (!IS_ERR(t)) return -EEXIST; } err = ip6_tnl_create2(dev); if (!err && tb[IFLA_MTU]) ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); return err; } static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t = netdev_priv(dev); struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) return -EINVAL; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) return err; } ip6_tnl_netlink_parms(data, &p); if (p.collect_md) return -EINVAL; t = ip6_tnl_locate(net, &p, 0); if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ip6_tnl_update(t, &p); return 0; } static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static size_t ip6_tnl_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_IPTUN_FLOWINFO */ nla_total_size(4) + /* IFLA_IPTUN_FLAGS */ nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } struct net *ip6_tnl_get_link_net(const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); return READ_ONCE(tunnel->net); } EXPORT_SYMBOL(ip6_tnl_get_link_net); static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { .kind = "ip6tnl", .maxtype = IFLA_IPTUN_MAX, .policy = ip6_tnl_policy, .priv_size = sizeof(struct ip6_tnl), .setup = ip6_tnl_dev_setup, .validate = ip6_tnl_validate, .newlink = ip6_tnl_newlink, .changelink = ip6_tnl_changelink, .dellink = ip6_tnl_dellink, .get_size = ip6_tnl_get_size, .fill_info = ip6_tnl_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, }; static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .handler = mplsip6_rcv, .err_handler = mplsip6_err, .priority = 1, }; static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; int h; struct ip6_tnl *t; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &ip6_link_ops) unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); t = rtnl_net_dereference(net, t->next); } } static int __net_init ip6_tnl_init_net(struct net *net) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; /** * ip6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init ip6_tunnel_init(void) { int err; if (!ipv6_mod_enabled()) return -EOPNOTSUPP; err = register_pernet_device(&ip6_tnl_net_ops); if (err < 0) goto out_pernet; err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); if (err < 0) { pr_err("%s: can't register ip4ip6\n", __func__); goto out_ip4ip6; } err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); if (err < 0) { pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } if (ip6_tnl_mpls_supported()) { err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); if (err < 0) { pr_err("%s: can't register mplsip6\n", __func__); goto out_mplsip6; } } err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: if (ip6_tnl_mpls_supported()) xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); out_ip4ip6: unregister_pernet_device(&ip6_tnl_net_ops); out_pernet: return err; } /** * ip6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit ip6_tunnel_cleanup(void) { rtnl_link_unregister(&ip6_link_ops); if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) pr_info("%s: can't deregister ip4ip6\n", __func__); if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); if (ip6_tnl_mpls_supported() && xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } module_init(ip6_tunnel_init); module_exit(ip6_tunnel_cleanup);
15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions Copyright (C) 1992 Drew Eckhardt */ #ifndef _LINUX_BLKDEV_H #define _LINUX_BLKDEV_H #include <linux/types.h> #include <linux/blk_types.h> #include <linux/device.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/minmax.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/wait.h> #include <linux/bio.h> #include <linux/gfp.h> #include <linux/kdev_t.h> #include <linux/rcupdate.h> #include <linux/percpu-refcount.h> #include <linux/blkzoned.h> #include <linux/sched.h> #include <linux/sbitmap.h> #include <linux/uuid.h> #include <linux/xarray.h> #include <linux/file.h> #include <linux/lockdep.h> struct module; struct request_queue; struct elevator_queue; struct blk_trace; struct request; struct sg_io_hdr; struct blkcg_gq; struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; extern const struct device_type disk_type; extern const struct device_type part_type; extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ #define BLKCG_MAX_POLS 6 #define DISK_MAX_PARTS 256 #define DISK_NAME_LEN 32 #define PARTITION_META_INFO_VOLNAMELTH 64 /* * Enough for the string representation of any kind of UUID plus NULL. * EFI UUID is 36 characters. MSDOS UUID is 11 characters. */ #define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) struct partition_meta_info { char uuid[PARTITION_META_INFO_UUIDLTH]; u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; /** * DOC: genhd capability flags * * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to * removable media. When set, the device remains present even when media is not * inserted. Shall not be set for devices which are removed entirely when the * media is removed. * * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, * doesn't appear in sysfs, and can't be opened from userspace or using * blkdev_get*. Used for the underlying components of multipath devices. * * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not * scan for partitions from add_disk, and users can't add partitions manually. * */ enum { GENHD_FL_REMOVABLE = 1 << 0, GENHD_FL_HIDDEN = 1 << 1, GENHD_FL_NO_PART = 1 << 2, }; enum { DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ }; enum { /* Poll even if events_poll_msecs is unset */ DISK_EVENT_FLAG_POLL = 1 << 0, /* Forward events to udev */ DISK_EVENT_FLAG_UEVENT = 1 << 1, /* Block event polling when open for exclusive write */ DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, }; struct disk_events; struct badblocks; enum blk_integrity_checksum { BLK_INTEGRITY_CSUM_NONE = 0, BLK_INTEGRITY_CSUM_IP = 1, BLK_INTEGRITY_CSUM_CRC = 2, BLK_INTEGRITY_CSUM_CRC64 = 3, } __packed ; struct blk_integrity { unsigned char flags; enum blk_integrity_checksum csum_type; unsigned char tuple_size; unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; }; typedef unsigned int __bitwise blk_mode_t; /* open for reading */ #define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) /* open for writing */ #define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) /* open exclusively (vs other exclusive openers */ #define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) /* opened with O_NDELAY */ #define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) /* open for "writes" only for ioctls (specialy hack for floppy.c) */ #define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) /* open is exclusive wrt all other BLK_OPEN_WRITE opens to the device */ #define BLK_OPEN_RESTRICT_WRITES ((__force blk_mode_t)(1 << 5)) /* return partition scanning errors */ #define BLK_OPEN_STRICT_SCAN ((__force blk_mode_t)(1 << 6)) struct gendisk { /* * major/first_minor/minors should not be set by any new driver, the * block core will take care of allocating them automatically. */ int major; int first_minor; int minors; char disk_name[DISK_NAME_LEN]; /* name of major driver */ unsigned short events; /* supported events */ unsigned short event_flags; /* flags related to event processing */ struct xarray part_tbl; struct block_device *part0; const struct block_device_operations *fops; struct request_queue *queue; void *private_data; struct bio_set bio_split; int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 #define GD_DEAD 2 #define GD_NATIVE_CAPACITY 3 #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ struct backing_dev_info *bdi; struct kobject queue_kobj; /* the queue/ directory */ struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; #endif struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev; #ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information. Reads of this information must be * protected with blk_queue_enter() / blk_queue_exit(). Modifying this * information is only allowed while no requests are being processed. * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). */ unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; unsigned long __rcu *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) struct cdrom_device_info *cdi; #endif int node_id; struct badblocks *bb; struct lockdep_map lockdep_map; u64 diskseq; blk_mode_t open_mode; /* * Independent sector access ranges. This is always NULL for * devices that do not have multiple independent access ranges. */ struct blk_independent_access_ranges *ia_ranges; }; /** * disk_openers - returns how many openers are there for a disk * @disk: disk to check * * This returns the number of openers for a disk. Note that this value is only * stable if disk->open_mutex is held. * * Note: Due to a quirk in the block layer open code, each open partition is * only counted once even if there are multiple openers. */ static inline unsigned int disk_openers(struct gendisk *disk) { return atomic_read(&disk->part0->bd_openers); } /** * disk_has_partscan - return %true if partition scanning is enabled on a disk * @disk: disk to check * * Returns %true if partitions scanning is enabled for @disk, or %false if * partition scanning is disabled either permanently or temporarily. */ static inline bool disk_has_partscan(struct gendisk *disk) { return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); } /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. */ #define dev_to_disk(device) \ (dev_to_bdev(device)->bd_disk) #define disk_to_dev(disk) \ (&((disk)->part0->bd_device)) #if IS_REACHABLE(CONFIG_CDROM) #define disk_to_cdi(disk) ((disk)->cdi) #else #define disk_to_cdi(disk) NULL #endif static inline dev_t disk_devt(struct gendisk *disk) { return MKDEV(disk->major, disk->first_minor); } /* * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER) * however we constrain this to what we can validate and test. */ #define BLK_MAX_BLOCK_SIZE SZ_64K /* blk_validate_limits() validates bsize, so drivers don't usually need to */ static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize)) return -EINVAL; return 0; } static inline bool blk_op_is_passthrough(blk_opf_t op) { op &= REQ_OP_MASK; return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } /* flags set by the driver in queue_limits.features */ typedef unsigned int __bitwise blk_features_t; /* supports a volatile write cache */ #define BLK_FEAT_WRITE_CACHE ((__force blk_features_t)(1u << 0)) /* supports passing on the FUA bit */ #define BLK_FEAT_FUA ((__force blk_features_t)(1u << 1)) /* rotational device (hard drive or floppy) */ #define BLK_FEAT_ROTATIONAL ((__force blk_features_t)(1u << 2)) /* contributes to the random number pool */ #define BLK_FEAT_ADD_RANDOM ((__force blk_features_t)(1u << 3)) /* do disk/partitions IO accounting */ #define BLK_FEAT_IO_STAT ((__force blk_features_t)(1u << 4)) /* don't modify data until writeback is done */ #define BLK_FEAT_STABLE_WRITES ((__force blk_features_t)(1u << 5)) /* always completes in submit context */ #define BLK_FEAT_SYNCHRONOUS ((__force blk_features_t)(1u << 6)) /* supports REQ_NOWAIT */ #define BLK_FEAT_NOWAIT ((__force blk_features_t)(1u << 7)) /* supports DAX */ #define BLK_FEAT_DAX ((__force blk_features_t)(1u << 8)) /* supports I/O polling */ #define BLK_FEAT_POLL ((__force blk_features_t)(1u << 9)) /* is a zoned device */ #define BLK_FEAT_ZONED ((__force blk_features_t)(1u << 10)) /* supports PCI(e) p2p requests */ #define BLK_FEAT_PCI_P2PDMA ((__force blk_features_t)(1u << 12)) /* skip this queue in blk_mq_(un)quiesce_tagset */ #define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) /* bounce all highmem pages */ #define BLK_FEAT_BOUNCE_HIGH ((__force blk_features_t)(1u << 14)) /* undocumented magic for bcache */ #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) /* atomic writes enabled */ #define BLK_FEAT_ATOMIC_WRITES \ ((__force blk_features_t)(1u << 16)) /* * Flags automatically inherited when stacking limits. */ #define BLK_FEAT_INHERIT_MASK \ (BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \ BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH | \ BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE) /* internal flags in queue_limits.flags */ typedef unsigned int __bitwise blk_flags_t; /* do not send FLUSH/FUA commands despite advertising a write cache */ #define BLK_FLAG_WRITE_CACHE_DISABLED ((__force blk_flags_t)(1u << 0)) /* I/O topology is misaligned */ #define BLK_FLAG_MISALIGNED ((__force blk_flags_t)(1u << 1)) /* passthrough command IO accounting */ #define BLK_FLAG_IOSTATS_PASSTHROUGH ((__force blk_flags_t)(1u << 2)) struct queue_limits { blk_features_t features; blk_flags_t flags; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; unsigned int max_hw_sectors; unsigned int max_dev_sectors; unsigned int chunk_sectors; unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; unsigned int min_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; unsigned int io_min; unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_hw_zone_append_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; unsigned int zone_write_granularity; /* atomic write limits */ unsigned int atomic_write_hw_max; unsigned int atomic_write_max_sectors; unsigned int atomic_write_hw_boundary; unsigned int atomic_write_boundary_sectors; unsigned int atomic_write_hw_unit_min; unsigned int atomic_write_unit_min; unsigned int atomic_write_hw_unit_max; unsigned int atomic_write_unit_max; unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; unsigned int max_open_zones; unsigned int max_active_zones; /* * Drivers that set dma_alignment to less than 511 must be prepared to * handle individual bvec's that are not a multiple of a SECTOR_SIZE * due to possible offsets. */ unsigned int dma_alignment; unsigned int dma_pad_mask; struct blk_integrity integrity; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk); /* * Independent access ranges: struct blk_independent_access_range describes * a range of contiguous sectors that can be accessed using device command * execution resources that are independent from the resources used for * other access ranges. This is typically found with single-LUN multi-actuator * HDDs where each access range is served by a different set of heads. * The set of independent ranges supported by the device is defined using * struct blk_independent_access_ranges. The independent ranges must not overlap * and must include all sectors within the disk capacity (no sector holes * allowed). * For a device with multiple ranges, requests targeting sectors in different * ranges can be executed in parallel. A request can straddle an access range * boundary. */ struct blk_independent_access_range { struct kobject kobj; sector_t sector; sector_t nr_sectors; }; struct blk_independent_access_ranges { struct kobject kobj; bool sysfs_registered; unsigned int nr_ia_ranges; struct blk_independent_access_range ia_range[]; }; struct request_queue { /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. */ void *queuedata; struct elevator_queue *elevator; const struct blk_mq_ops *mq_ops; /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; /* * various queue flags, see QUEUE_* below */ unsigned long queue_flags; unsigned int rq_timeout; unsigned int queue_depth; refcount_t refs; /* hw dispatch queues */ unsigned int nr_hw_queues; struct xarray hctx_table; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; struct lockdep_map io_lockdep_map; struct lock_class_key q_lock_cls_key; struct lockdep_map q_lockdep_map; struct request *last_merge; spinlock_t queue_lock; int quiesce_depth; struct gendisk *disk; /* * mq queue kobject */ struct kobject *mq_kobj; struct queue_limits limits; #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; #endif /* * Number of contexts that have called blk_set_pm_only(). If this * counter is above zero then only RQF_PM requests are processed. */ atomic_t pm_only; struct blk_queue_stats *stats; struct rq_qos *rq_qos; struct mutex rq_qos_mutex; /* * ida allocated id for this queue. Used to index queues from * ioctx. */ int id; /* * queue settings */ unsigned long nr_requests; /* Max # of requests */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; struct kobject *crypto_kobject; #endif struct timer_list timeout; struct work_struct timeout_work; atomic_t nr_active_requests_shared_tags; struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); struct blkcg_gq *root_blkg; struct list_head blkg_list; struct mutex blkcg_mutex; #endif int node; spinlock_t requeue_lock; struct list_head requeue_list; struct delayed_work requeue_work; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif /* * for flush operations */ struct blk_flush_queue *fq; struct list_head flush_list; /* * Protects against I/O scheduler switching, particularly when updating * q->elevator. Since the elevator update code path may also modify q-> * nr_requests and wbt latency, this lock also protects the sysfs attrs * nr_requests and wbt_lat_usec. Additionally the nr_hw_queues update * may modify hctx tags, reserved-tags and cpumask, so this lock also * helps protect the hctx sysfs/debugfs attrs. To ensure proper locking * order during an elevator or nr_hw_queue update, first freeze the * queue, then acquire ->elevator_lock. */ struct mutex elevator_lock; struct mutex sysfs_lock; /* * Protects queue limits and also sysfs attribute read_ahead_kb. */ struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating * nr_hw_queues */ struct list_head unused_hctx_list; spinlock_t unused_hctx_lock; int mq_freeze_depth; #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; #endif struct rcu_head rcu_head; #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; /* * Records disk & queue state in current context, used in unfreeze * queue */ bool mq_freeze_disk_dead; bool mq_freeze_queue_dying; #endif wait_queue_head_t mq_freeze_wq; /* * Protect concurrent access to q_usage_counter by * percpu_ref_kill() and percpu_ref_reinit(). */ struct mutex mq_freeze_lock; struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct dentry *debugfs_dir; struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; /* * Serializes all debugfs metadata operations using the above dentries. */ struct mutex debugfs_mutex; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ enum { QUEUE_FLAG_DYING, /* queue being torn down */ QUEUE_FLAG_NOMERGES, /* disable merge attempts */ QUEUE_FLAG_SAME_COMP, /* complete on same CPU-group */ QUEUE_FLAG_FAIL_IO, /* fake timeout */ QUEUE_FLAG_NOXMERGES, /* No extended merges */ QUEUE_FLAG_SAME_FORCE, /* force complete on same CPU */ QUEUE_FLAG_INIT_DONE, /* queue is initialized */ QUEUE_FLAG_STATS, /* track IO start and completion times */ QUEUE_FLAG_REGISTERED, /* queue has been registered to a disk */ QUEUE_FLAG_QUIESCED, /* queue has been quiesced */ QUEUE_FLAG_RQ_ALLOC_TIME, /* record rq->alloc_time_ns */ QUEUE_FLAG_HCTX_ACTIVE, /* at least one blk-mq hctx is active */ QUEUE_FLAG_SQ_SCHED, /* single queue style io dispatch */ QUEUE_FLAG_MAX }; #define QUEUE_FLAG_MQ_DEFAULT (1UL << QUEUE_FLAG_SAME_COMP) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_passthrough_stat(q) \ ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH) #define blk_queue_dax(q) ((q)->limits.features & BLK_FEAT_DAX) #define blk_queue_pci_p2pdma(q) ((q)->limits.features & BLK_FEAT_PCI_P2PDMA) #ifdef CONFIG_BLK_RQ_ALLOC_TIME #define blk_queue_rq_alloc_time(q) \ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) #else #define blk_queue_rq_alloc_time(q) false #endif #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) #define blk_queue_skip_tagset_quiesce(q) \ ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define dma_map_bvec(dev, bv, dir, attrs) \ dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; } #ifdef CONFIG_PM static inline enum rpm_status queue_rpm_status(struct request_queue *q) { return q->rpm_status; } #else static inline enum rpm_status queue_rpm_status(struct request_queue *q) { return RPM_ACTIVE; } #endif static inline bool blk_queue_is_zoned(struct request_queue *q) { return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (q->limits.features & BLK_FEAT_ZONED); } static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { if (!blk_queue_is_zoned(disk->queue)) return 0; return sector >> ilog2(disk->queue->limits.chunk_sectors); } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_active_zones; } static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) return q->queue_depth; return q->nr_requests; } /* * default timeout for SG_IO if none specified */ #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) /* This should not be used directly - use rq_for_each_segment */ #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode); int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups); static inline int __must_check add_disk(struct gendisk *disk) { return device_add_disk(NULL, disk, NULL); } void del_gendisk(struct gendisk *gp); void invalidate_disk(struct gendisk *disk); void set_disk_ro(struct gendisk *disk, bool read_only); void disk_uevent(struct gendisk *disk, enum kobject_action action); static inline u8 bdev_partno(const struct block_device *bdev) { return atomic_read(&bdev->__bd_flags) & BD_PARTNO; } static inline bool bdev_test_flag(const struct block_device *bdev, unsigned flag) { return atomic_read(&bdev->__bd_flags) & flag; } static inline void bdev_set_flag(struct block_device *bdev, unsigned flag) { atomic_or(flag, &bdev->__bd_flags); } static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) { atomic_andnot(flag, &bdev->__bd_flags); } static inline bool get_disk_ro(struct gendisk *disk) { return bdev_test_flag(disk->part0, BD_READ_ONLY) || test_bit(GD_READ_ONLY, &disk->state); } static inline bool bdev_read_only(struct block_device *bdev) { return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); } bool set_capacity_and_notify(struct gendisk *disk, sector_t size); void disk_force_media_change(struct gendisk *disk); void bdev_mark_dead(struct block_device *bdev, bool surprise); void add_disk_randomness(struct gendisk *disk) __latent_entropy; void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_start_sect; } static inline sector_t bdev_nr_sectors(struct block_device *bdev) { return bdev->bd_nr_sectors; } static inline loff_t bdev_nr_bytes(struct block_device *bdev) { return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; } static inline sector_t get_capacity(struct gendisk *disk) { return bdev_nr_sectors(disk->part0); } static inline u64 sb_bdev_nr_blocks(struct super_block *sb) { return bdev_nr_sectors(sb->s_bdev) >> (sb->s_blocksize_bits - SECTOR_SHIFT); } #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int disk_nr_zones(struct gendisk *disk) { return disk->nr_zones; } bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); /** * disk_zone_capacity - returns the zone capacity of zone containing @sector * @disk: disk to work with * @sector: sector number within the querying zone * * Returns the zone capacity of a zone containing @sector. @sector can be any * sector in the zone. */ static inline unsigned int disk_zone_capacity(struct gendisk *disk, sector_t sector) { sector_t zone_sectors = disk->queue->limits.chunk_sectors; if (sector + zone_sectors >= get_capacity(disk)) return disk->last_zone_capacity; return disk->zone_capacity; } static inline unsigned int bdev_zone_capacity(struct block_device *bdev, sector_t pos) { return disk_zone_capacity(bdev->bd_disk, pos); } #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) { return false; } #endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int bdev_nr_zones(struct block_device *bdev) { return disk_nr_zones(bdev->bd_disk); } int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * * Returns an ERR_PTR on error, else the allocated disk. * * Context: can sleep */ #define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)); #define register_blkdev(major, name) \ __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); bool disk_check_media_change(struct gendisk *disk); void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); void inc_diskseq(struct gendisk *disk); void blk_request_module(dev_t devt); extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { return bdev->bd_queue; /* this is never NULL */ } /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond); static inline unsigned int bio_zone_no(struct bio *bio) { return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); } static inline bool bio_straddles_zones(struct bio *bio) { return bio_sectors(bio) && bio_zone_no(bio) != disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); } /* * Return how much within the boundary is left to be used for I/O at a given * offset. */ static inline unsigned int blk_boundary_sectors_left(sector_t offset, unsigned int boundary_sectors) { if (unlikely(!is_power_of_2(boundary_sectors))) return boundary_sectors - sector_div(offset, boundary_sectors); return boundary_sectors - (offset & (boundary_sectors - 1)); } /** * queue_limits_start_update - start an atomic update of queue limits * @q: queue to update * * This functions starts an atomic update of the queue limits. It takes a lock * to prevent other updates and returns a snapshot of the current limits that * the caller can modify. The caller must call queue_limits_commit_update() * to finish the update. * * Context: process context. */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) { mutex_lock(&q->limits_lock); return q->limits; } int queue_limits_commit_update_frozen(struct request_queue *q, struct queue_limits *lim); int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); int blk_validate_limits(struct queue_limits *lim); /** * queue_limits_cancel_update - cancel an atomic update of queue limits * @q: queue to update * * This functions cancels an atomic update of the queue limits started by * queue_limits_start_update() and should be used when an error occurs after * starting update. */ static inline void queue_limits_cancel_update(struct request_queue *q) { mutex_unlock(&q->limits_lock); } /* * These helpers are for drivers that have sloppy feature negotiation and might * have to disable DISCARD, WRITE_ZEROES or SECURE_DISCARD from the I/O * completion handler when the device returned an indicator that the respective * feature is not actually supported. They are racy and the driver needs to * cope with that. Try to avoid this scheme if you can. */ static inline void blk_queue_disable_discard(struct request_queue *q) { q->limits.max_discard_sectors = 0; } static inline void blk_queue_disable_secure_erase(struct request_queue *q) { q->limits.max_secure_erase_sectors = 0; } static inline void blk_queue_disable_write_zeroes(struct request_queue *q) { q->limits.max_write_zeroes_sectors = 0; } /* * Access functions for manipulating queue properties */ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); struct blk_independent_access_ranges * disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars); bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); void blk_mark_disk_dead(struct gendisk *disk); struct rq_list { struct request *head; struct request *tail; }; #ifdef CONFIG_BLOCK /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests * into single larger request. As the requests are moved from a per-task list to * the device's request_queue in a batch, this results in improved scalability * as the lock contention for request_queue lock is reduced. * * It is ok not to disable preemption when adding the request to the plug list * or when attempting a merge. For details, please see schedule() where * blk_flush_plug() is called. */ struct blk_plug { struct rq_list mq_list; /* blk-mq requests */ /* if ios_left is > 1, we can batch tag/rq allocations */ struct rq_list cached_rqs; u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; bool multiple_queues; bool has_elevator; struct list_head cb_list; /* md requires an unplug callback */ }; struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); struct blk_plug_cb { struct list_head list; blk_plug_cb_fn callback; void *data; }; extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size); extern void blk_start_plug(struct blk_plug *); extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); void __blk_flush_plug(struct blk_plug *plug, bool from_schedule); static inline void blk_flush_plug(struct blk_plug *plug, bool async) { if (plug) __blk_flush_plug(plug, async); } /* * tsk == current here */ static inline void blk_plug_invalidate_ts(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; if (plug) plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ struct blk_plug { }; static inline void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) { } static inline void blk_start_plug(struct blk_plug *plug) { } static inline void blk_finish_plug(struct blk_plug *plug) { } static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } static inline void blk_plug_invalidate_ts(struct task_struct *tsk) { } static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; } static inline long nr_blockdev_pages(void) { return 0; } #endif /* CONFIG_BLOCK */ extern void blk_io_schedule(void); int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop); int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ #define BLKDEV_ZERO_KILLABLE (1 << 2) /* interruptible by fatal signals */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), gfp_mask); } static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask) { return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), gfp_mask, 0); } static inline bool bdev_is_partition(struct block_device *bdev) { return bdev_partno(bdev) != 0; } enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; /* * Default upper limit for the software max_sectors limit used for * regular file system I/O. This can be increased through sysfs. * * Not to be confused with the max_hw_sector limit that is entirely * controlled by the driver, usually based on hardware limits. */ #define BLK_DEF_MAX_SECTORS_CAP 2560u static inline struct queue_limits *bdev_limits(struct block_device *bdev) { return &bdev_get_queue(bdev)->limits; } static inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; } static inline unsigned long queue_virt_boundary(const struct request_queue *q) { return q->limits.virt_boundary_mask; } static inline unsigned int queue_max_sectors(const struct request_queue *q) { return q->limits.max_sectors; } static inline unsigned int queue_max_bytes(struct request_queue *q) { return min_t(unsigned int, queue_max_sectors(q), INT_MAX >> 9) << 9; } static inline unsigned int queue_max_hw_sectors(const struct request_queue *q) { return q->limits.max_hw_sectors; } static inline unsigned short queue_max_segments(const struct request_queue *q) { return q->limits.max_segments; } static inline unsigned short queue_max_discard_segments(const struct request_queue *q) { return q->limits.max_discard_segments; } static inline unsigned int queue_max_segment_size(const struct request_queue *q) { return q->limits.max_segment_size; } static inline bool queue_emulates_zone_append(struct request_queue *q) { return blk_queue_is_zoned(q) && !q->limits.max_hw_zone_append_sectors; } static inline bool bdev_emulates_zone_append(struct block_device *bdev) { return queue_emulates_zone_append(bdev_get_queue(bdev)); } static inline unsigned int bdev_max_zone_append_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_zone_append_sectors; } static inline unsigned int bdev_max_segments(struct block_device *bdev) { return queue_max_segments(bdev_get_queue(bdev)); } static inline unsigned queue_logical_block_size(const struct request_queue *q) { return q->limits.logical_block_size; } static inline unsigned int bdev_logical_block_size(struct block_device *bdev) { return queue_logical_block_size(bdev_get_queue(bdev)); } static inline unsigned int queue_physical_block_size(const struct request_queue *q) { return q->limits.physical_block_size; } static inline unsigned int bdev_physical_block_size(struct block_device *bdev) { return queue_physical_block_size(bdev_get_queue(bdev)); } static inline unsigned int queue_io_min(const struct request_queue *q) { return q->limits.io_min; } static inline unsigned int bdev_io_min(struct block_device *bdev) { return queue_io_min(bdev_get_queue(bdev)); } static inline unsigned int queue_io_opt(const struct request_queue *q) { return q->limits.io_opt; } static inline unsigned int bdev_io_opt(struct block_device *bdev) { return queue_io_opt(bdev_get_queue(bdev)); } static inline unsigned int queue_zone_write_granularity(const struct request_queue *q) { return q->limits.zone_write_granularity; } static inline unsigned int bdev_zone_write_granularity(struct block_device *bdev) { return queue_zone_write_granularity(bdev_get_queue(bdev)); } int bdev_alignment_offset(struct block_device *bdev); unsigned int bdev_discard_alignment(struct block_device *bdev); static inline unsigned int bdev_max_discard_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_discard_sectors; } static inline unsigned int bdev_discard_granularity(struct block_device *bdev) { return bdev_limits(bdev)->discard_granularity; } static inline unsigned int bdev_max_secure_erase_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_secure_erase_sectors; } static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { return bdev_limits(bdev)->max_write_zeroes_sectors; } static inline bool bdev_nonrot(struct block_device *bdev) { return blk_queue_nonrot(bdev_get_queue(bdev)); } static inline bool bdev_synchronous(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } static inline bool bdev_stable_writes(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) return true; return q->limits.features & BLK_FEAT_STABLE_WRITES; } static inline bool blk_queue_write_cache(struct request_queue *q) { return (q->limits.features & BLK_FEAT_WRITE_CACHE) && !(q->limits.flags & BLK_FLAG_WRITE_CACHE_DISABLED); } static inline bool bdev_write_cache(struct block_device *bdev) { return blk_queue_write_cache(bdev_get_queue(bdev)); } static inline bool bdev_fua(struct block_device *bdev) { return bdev_limits(bdev)->features & BLK_FEAT_FUA; } static inline bool bdev_nowait(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_NOWAIT; } static inline bool bdev_is_zoned(struct block_device *bdev) { return blk_queue_is_zoned(bdev_get_queue(bdev)); } static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) { return disk_zone_no(bdev->bd_disk, sec); } static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (!blk_queue_is_zoned(q)) return 0; return q->limits.chunk_sectors; } static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, sector_t sector) { return sector & (bdev_zone_sectors(bdev) - 1); } static inline sector_t bio_offset_from_zone_start(struct bio *bio) { return bdev_offset_from_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector); } static inline bool bdev_is_zone_start(struct block_device *bdev, sector_t sector) { return bdev_offset_from_zone_start(bdev, sector) == 0; } /** * bdev_zone_is_seq - check if a sector belongs to a sequential write zone * @bdev: block device to check * @sector: sector number * * Check if @sector on @bdev is contained in a sequential write required zone. */ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) { bool is_seq = false; #if IS_ENABLED(CONFIG_BLK_DEV_ZONED) if (bdev_is_zoned(bdev)) { struct gendisk *disk = bdev->bd_disk; unsigned long *bitmap; rcu_read_lock(); bitmap = rcu_dereference(disk->conv_zones_bitmap); is_seq = !bitmap || !test_bit(disk_zone_no(disk, sector), bitmap); rcu_read_unlock(); } #endif return is_seq; } int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; } static inline unsigned int queue_atomic_write_unit_max_bytes(const struct request_queue *q) { return q->limits.atomic_write_unit_max; } static inline unsigned int queue_atomic_write_unit_min_bytes(const struct request_queue *q) { return q->limits.atomic_write_unit_min; } static inline unsigned int queue_atomic_write_boundary_bytes(const struct request_queue *q) { return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT; } static inline unsigned int queue_atomic_write_max_bytes(const struct request_queue *q) { return q->limits.atomic_write_max_sectors << SECTOR_SHIFT; } static inline unsigned int bdev_dma_alignment(struct block_device *bdev) { return queue_dma_alignment(bdev_get_queue(bdev)); } static inline bool bdev_iter_is_aligned(struct block_device *bdev, struct iov_iter *iter) { return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev), bdev_logical_block_size(bdev) - 1); } static inline unsigned int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { return lim->dma_alignment | lim->dma_pad_mask; } static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); return !(addr & alignment) && !(len & alignment); } /* assumes size > 256 */ static inline unsigned int blksize_bits(unsigned int size) { return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT; } int kblockd_schedule_work(struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") #ifdef CONFIG_BLK_INLINE_ENCRYPTION bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool blk_crypto_register(struct blk_crypto_profile *profile, struct request_queue *q) { return true; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ enum blk_unique_id { /* these match the Designator Types specified in SPC */ BLK_UID_T10 = 1, BLK_UID_EUI64 = 2, BLK_UID_NAA = 3, }; struct block_device_operations { void (*submit_bio)(struct bio *bio); int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int (*open)(struct gendisk *disk, blk_mode_t mode); void (*release)(struct gendisk *disk); int (*ioctl)(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg); int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); struct module *owner; const struct pr_ops *pr_ops; /* * Special callback for probing GPT entry at a given sector. * Needed by Android devices, used by GPT scanner and MMC blk * driver. */ int (*alternative_gpt_sector)(struct gendisk *disk, sector_t *sector); }; #ifdef CONFIG_COMPAT extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t, unsigned int, unsigned long); #else #define blkdev_compat_ptr_ioctl NULL #endif static inline void blk_wake_io_task(struct task_struct *waiter) { /* * If we're polling, the task itself is doing the completions. For * that case, we don't need to signal a wakeup, it's enough to just * mark us as RUNNING. */ if (waiter == current) __set_current_state(TASK_RUNNING); else wake_up_process(waiter); } unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned int sectors, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev); /** * bio_end_io_acct - end I/O accounting for bio based drivers * @bio: bio to end account for * @start_time: start time returned by bio_start_io_acct() */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); void blkdev_show(struct seq_file *seqf, off_t offset); #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_MAX 512 #else #define BLKDEV_MAJOR_MAX 0 #endif struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev, bool surprise); /* * Sync the file system mounted on the block device. */ void (*sync)(struct block_device *bdev); /* * Freeze the file system mounted on the block device. */ int (*freeze)(struct block_device *bdev); /* * Thaw the file system mounted on the block device. */ int (*thaw)(struct block_device *bdev); }; /* * For filesystems using @fs_holder_ops, the @holder argument passed to * helpers used to open and claim block devices via * bd_prepare_to_claim() must point to a superblock. */ extern const struct blk_holder_ops fs_holder_ops; /* * Return the correct open flags for blkdev_get_by_* for super block flags * as stored in sb->s_flags. */ #define sb_open_mode(flags) \ (BLK_OPEN_READ | BLK_OPEN_RESTRICT_WRITES | \ (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); /* just for blk-cgroup, don't use elsewhere */ struct block_device *blkdev_get_no_open(dev_t dev); void blkdev_put_no_open(struct block_device *bdev); struct block_device *I_BDEV(struct inode *inode); struct block_device *file_bdev(struct file *bdev_file); bool disk_live(struct gendisk *disk); unsigned int block_size(struct block_device *bdev); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask); void printk_all_partitions(void); int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else static inline void invalidate_bdev(struct block_device *bdev) { } static inline int sync_blockdev(struct block_device *bdev) { return 0; } static inline int sync_blockdev_nowait(struct block_device *bdev) { return 0; } static inline void sync_bdevs(bool wait) { } static inline void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) { } static inline void printk_all_partitions(void) { } static inline int early_lookup_bdev(const char *pathname, dev_t *dev) { return -EINVAL; } #endif /* CONFIG_BLOCK */ int bdev_freeze(struct block_device *bdev); int bdev_thaw(struct block_device *bdev); void bdev_fput(struct file *bdev_file); struct io_comp_batch { struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); }; static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, struct queue_limits *limits) { unsigned int alignment = max(limits->atomic_write_hw_unit_min, limits->atomic_write_hw_boundary); return IS_ALIGNED(sector, alignment >> SECTOR_SHIFT); } static inline bool bdev_can_atomic_write(struct block_device *bdev) { struct request_queue *bd_queue = bdev->bd_queue; struct queue_limits *limits = &bd_queue->limits; if (!limits->atomic_write_unit_min) return false; if (bdev_is_partition(bdev)) return blk_atomic_write_start_sect_aligned(bdev->bd_start_sect, limits); return true; } static inline unsigned int bdev_atomic_write_unit_min_bytes(struct block_device *bdev) { if (!bdev_can_atomic_write(bdev)) return 0; return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); } static inline unsigned int bdev_atomic_write_unit_max_bytes(struct block_device *bdev) { if (!bdev_can_atomic_write(bdev)) return 0; return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); } #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */
289 290 289 290 290 290 290 290 280 280 226 227 227 227 227 225 144 279 282 280 280 280 280 279 280 280 279 3 280 226 279 277 277 179 178 178 5 3 277 206 205 205 205 205 206 206 274 15 1 3 282 281 282 282 15 4 271 283 281 283 280 74 179 3 3 1 1 1 2 179 137 40 128 168 180 16 180 178 179 179 180 15 168 277 277 180 150 30 180 140 179 168 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 /* SPDX-License-Identifier: GPL-2.0 * * page_pool.c * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ #include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> #include <linux/mm.h> /* for put_page() */ #include <linux/poison.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include <trace/events/page_pool.h> #include "dev.h" #include "mp_dmabuf_devmem.h" #include "netmem_priv.h" #include "page_pool_priv.h" DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) #define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); /* alloc_stat_inc is intended to be used in softirq context */ #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) /* recycle_stat_inc is safe to use when preemption is possible. */ #define recycle_stat_inc(pool, __stat) \ do { \ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ this_cpu_inc(s->__stat); \ } while (0) #define recycle_stat_add(pool, __stat, val) \ do { \ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ this_cpu_add(s->__stat, val); \ } while (0) static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_alloc_fast", "rx_pp_alloc_slow", "rx_pp_alloc_slow_ho", "rx_pp_alloc_empty", "rx_pp_alloc_refill", "rx_pp_alloc_waive", "rx_pp_recycle_cached", "rx_pp_recycle_cache_full", "rx_pp_recycle_ring", "rx_pp_recycle_ring_full", "rx_pp_recycle_released_ref", }; /** * page_pool_get_stats() - fetch page pool stats * @pool: pool from which page was allocated * @stats: struct page_pool_stats to fill in * * Retrieve statistics about the page_pool. This API is only available * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. * A pointer to a caller allocated struct page_pool_stats structure * is passed to this API which is filled in. The caller can then report * those stats to the user (perhaps via ethtool, debugfs, etc.). */ bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats) { int cpu = 0; if (!stats) return false; /* The caller is responsible to initialize stats. */ stats->alloc_stats.fast += pool->alloc_stats.fast; stats->alloc_stats.slow += pool->alloc_stats.slow; stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; stats->alloc_stats.empty += pool->alloc_stats.empty; stats->alloc_stats.refill += pool->alloc_stats.refill; stats->alloc_stats.waive += pool->alloc_stats.waive; for_each_possible_cpu(cpu) { const struct page_pool_recycle_stats *pcpu = per_cpu_ptr(pool->recycle_stats, cpu); stats->recycle_stats.cached += pcpu->cached; stats->recycle_stats.cache_full += pcpu->cache_full; stats->recycle_stats.ring += pcpu->ring; stats->recycle_stats.ring_full += pcpu->ring_full; stats->recycle_stats.released_refcnt += pcpu->released_refcnt; } return true; } EXPORT_SYMBOL(page_pool_get_stats); u8 *page_pool_ethtool_stats_get_strings(u8 *data) { int i; for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { memcpy(data, pp_stats[i], ETH_GSTRING_LEN); data += ETH_GSTRING_LEN; } return data; } EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); int page_pool_ethtool_stats_get_count(void) { return ARRAY_SIZE(pp_stats); } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; *data++ = pool_stats->alloc_stats.slow_high_order; *data++ = pool_stats->alloc_stats.empty; *data++ = pool_stats->alloc_stats.refill; *data++ = pool_stats->alloc_stats.waive; *data++ = pool_stats->recycle_stats.cached; *data++ = pool_stats->recycle_stats.cache_full; *data++ = pool_stats->recycle_stats.ring; *data++ = pool_stats->recycle_stats.ring_full; *data++ = pool_stats->recycle_stats.released_refcnt; return data; } EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else #define alloc_stat_inc(pool, __stat) #define recycle_stat_inc(pool, __stat) #define recycle_stat_add(pool, __stat, val) #endif static bool page_pool_producer_lock(struct page_pool *pool) __acquires(&pool->ring.producer_lock) { bool in_softirq = in_softirq(); if (in_softirq) spin_lock(&pool->ring.producer_lock); else spin_lock_bh(&pool->ring.producer_lock); return in_softirq; } static void page_pool_producer_unlock(struct page_pool *pool, bool in_softirq) __releases(&pool->ring.producer_lock) { if (in_softirq) spin_unlock(&pool->ring.producer_lock); else spin_unlock_bh(&pool->ring.producer_lock); } static void page_pool_struct_check(void) { CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, PAGE_POOL_FRAG_GROUP_ALIGN); } static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params, int cpuid) { unsigned int ring_qsize = 1024; /* Default */ struct netdev_rx_queue *rxq; int err; page_pool_struct_check(); memcpy(&pool->p, &params->fast, sizeof(pool->p)); memcpy(&pool->slow, &params->slow, sizeof(pool->slow)); pool->cpuid = cpuid; pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) return -EINVAL; if (pool->p.pool_size) ring_qsize = pool->p.pool_size; /* Sanity limit mem that can be pinned down */ if (ring_qsize > 32768) return -E2BIG; /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ if (pool->slow.flags & PP_FLAG_DMA_MAP) { if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; pool->dma_map = true; } if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped */ if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) return -EINVAL; if (!pool->p.max_len) return -EINVAL; pool->dma_sync = true; /* pool->p.offset has to be set according to the address * offset used by the DMA engine to start copying rx data */ } pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); if (!pool->recycle_stats) return -ENOMEM; } else { /* For system page pool instance we use a singular stats object * instead of allocating a separate percpu variable for each * (also percpu) page pool instance. */ pool->recycle_stats = &pp_system_recycle_stats; pool->system = true; } #endif if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif return -ENOMEM; } atomic_set(&pool->pages_state_release_cnt, 0); /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { netdev_assert_locked(pool->slow.netdev); rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; pool->mp_ops = rxq->mp_params.mp_ops; } if (pool->mp_ops) { if (!pool->dma_map || !pool->dma_sync) return -EOPNOTSUPP; if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { err = -EFAULT; goto free_ptr_ring; } err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); goto free_ptr_ring; } static_branch_inc(&page_pool_mem_providers); } return 0; free_ptr_ring: ptr_ring_cleanup(&pool->ring, NULL); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif return err; } static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif } /** * page_pool_create_percpu() - create a page pool for a given cpu. * @params: parameters, see struct page_pool_params * @cpuid: cpu identifier */ struct page_pool * page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); if (!pool) return ERR_PTR(-ENOMEM); err = page_pool_init(pool, params, cpuid); if (err < 0) goto err_free; err = page_pool_list(pool); if (err) goto err_uninit; return pool; err_uninit: page_pool_uninit(pool); err_free: pr_warn("%s() gave up with errno %d\n", __func__, err); kfree(pool); return ERR_PTR(err); } EXPORT_SYMBOL(page_pool_create_percpu); /** * page_pool_create() - create a page pool * @params: parameters, see struct page_pool_params */ struct page_pool *page_pool_create(const struct page_pool_params *params) { return page_pool_create_percpu(params, -1); } EXPORT_SYMBOL(page_pool_create); static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; netmem_ref netmem; int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if (__ptr_ring_empty(r)) { alloc_stat_inc(pool, empty); return 0; } /* Softirq guarantee CPU and thus NUMA node is stable. This, * assumes CPU refilling driver RX-ring will also run RX-NAPI. */ #ifdef CONFIG_NUMA pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; #else /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ #endif /* Refill alloc array, but only if NUMA match */ do { netmem = (__force netmem_ref)__ptr_ring_consume(r); if (unlikely(!netmem)) break; if (likely(netmem_is_pref_nid(netmem, pref_nid))) { pool->alloc.cache[pool->alloc.count++] = netmem; } else { /* NUMA mismatch; * (1) release 1 page to page-allocator and * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ page_pool_return_page(pool, netmem); alloc_stat_inc(pool, waive); netmem = 0; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) { netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, refill); } return netmem; } /* fast path */ static netmem_ref __page_pool_get_cached(struct page_pool *pool) { netmem_ref netmem; /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ if (likely(pool->alloc.count)) { /* Fast-path */ netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, fast); } else { netmem = page_pool_refill_alloc_cache(pool); } return netmem; } static void __page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); dma_sync_size = min(dma_sync_size, pool->p.max_len); __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, dma_sync_size, pool->p.dma_dir); #endif } static __always_inline void page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { rcu_read_lock(); /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ if (pool->dma_sync) __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); rcu_read_unlock(); } } static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; int err; u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) * This mapping is kept for lifetime of page, until leaving pool. */ dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, (PAGE_SIZE << pool->p.order), pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); if (dma_mapping_error(pool->p.dev, dma)) return false; if (page_pool_set_dma_addr_netmem(netmem, dma)) { WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; } if (in_softirq()) err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), PP_DMA_INDEX_LIMIT, gfp); else err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), PP_DMA_INDEX_LIMIT, gfp); if (err) { WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); goto unset_failed; } netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; unset_failed: page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); return false; } static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { struct page *page; gfp |= __GFP_COMP; page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); if (unlikely(!page)) return NULL; if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } alloc_stat_inc(pool, slow_high_order); page_pool_set_pp_info(pool, page_to_netmem(page)); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page_to_netmem(page), pool->pages_state_hold_cnt); return page; } /* slow path */ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; unsigned int pp_order = pool->p.order; bool dma_map = pool->dma_map; netmem_ref netmem; int i, nr_pages; /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); /* Unnecessary as alloc cache is empty, but guarantees zero count */ if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; /* Pages have been filled into alloc.cache array, but count is zero and * page element have not been (possibly) DMA mapped. */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } page_pool_set_pp_info(pool, netmem); pool->alloc.cache[pool->alloc.count++] = netmem; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Return last page */ if (likely(pool->alloc.count > 0)) { netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, slow); } else { netmem = 0; } /* When page just alloc'ed is should/must have refcnt 1. */ return netmem; } /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; /* Fast-path: Get a page from cache */ netmem = __page_pool_get_cached(pool); if (netmem) return netmem; /* Slow-path: cache empty, do real allocation */ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; } EXPORT_SYMBOL(page_pool_alloc_netmems); ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); /* Calculate distance between two u32 values, valid if distance is below 2^(31) * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution */ #define _distance(a, b) (s32)((a) - (b)) s32 page_pool_inflight(const struct page_pool *pool, bool strict) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); s32 inflight; inflight = _distance(hold_cnt, release_cnt); if (strict) { trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); } else { inflight = max(0, inflight); } return inflight; } void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { netmem_set_pp(netmem, pool); netmem_or_pp_magic(netmem, PP_SIGNATURE); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it * is allocated from the page allocator and page_pool_fragment_page() * is dirtying the same cache line as the page->pp_magic above, so * the overhead is negligible. */ page_pool_fragment_netmem(netmem, 1); if (pool->has_init_callback) pool->slow.init_callback(netmem, pool->slow.init_arg); } void page_pool_clear_pp_info(netmem_ref netmem) { netmem_clear_pp_magic(netmem); netmem_set_pp(netmem, NULL); } static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { struct page *old, *page = netmem_to_page(netmem); unsigned long id; dma_addr_t dma; if (!pool->dma_map) /* Always account for inflight pages, even if we didn't * map them */ return; id = netmem_get_dma_index(netmem); if (!id) return; if (in_softirq()) old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); else old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); if (old != page) return; dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need * to disconnect a page (from a page_pool), to allow it to be used as * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) { int count; bool put; put = true; if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_page_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, netmem, count); if (put) { page_pool_clear_pp_info(netmem); put_page(netmem_to_page(netmem)); } /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). */ } static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { int ret; /* BH protection not needed if current is softirq */ if (in_softirq()) ret = ptr_ring_produce(&pool->ring, (__force void *)netmem); else ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem); if (!ret) { recycle_stat_inc(pool, ring); return true; } return false; } /* Only allow direct recycling in special circumstances, into the * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. * * Caller must provide appropriate safe context. */ static bool page_pool_recycle_in_cache(netmem_ref netmem, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { recycle_stat_inc(pool, cache_full); return false; } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ pool->alloc.cache[pool->alloc.count++] = netmem; recycle_stat_inc(pool, cached); return true; } static bool __page_pool_page_can_be_recycled(netmem_ref netmem) { return netmem_is_net_iov(netmem) || (page_ref_count(netmem_to_page(netmem)) == 1 && !page_is_pfmemalloc(netmem_to_page(netmem))); } /* If the page refcnt == 1, this will try to recycle the page. * If pool->dma_sync is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ static __always_inline netmem_ref __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { lockdep_assert_no_hardirq(); /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. * * page is NOT reusable when allocated when system is under * some pressure. (page_is_pfmemalloc) */ if (likely(__page_pool_page_can_be_recycled(netmem))) { /* Read barrier done in page_ref_count / READ_ONCE */ page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) return 0; /* Page found as candidate for recycling */ return netmem; } /* Fallback/non-XDP mode: API user have elevated refcnt. * * Many drivers split up the page into fragments, and some * want to keep doing this to save memory and do refcnt based * recycling. Support this use case too, to ease drivers * switching between XDP/non-XDP. * * In-case page_pool maintains the DMA mapping, API user must * call page_pool_put_page once. In this elevated refcnt * case, the DMA is unmapped/released, as driver is likely * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); page_pool_return_page(pool, netmem); return 0; } static bool page_pool_napi_local(const struct page_pool *pool) { const struct napi_struct *napi; u32 cpuid; if (unlikely(!in_softirq())) return false; /* Allow direct recycle if we have reasons to believe that we are * in the same context as the consumer would run, so there's * no possible race. * __page_pool_put_page() makes sure we're not in hardirq context * and interrupts are enabled prior to accessing the cache. */ cpuid = smp_processor_id(); if (READ_ONCE(pool->cpuid) == cpuid) return true; napi = READ_ONCE(pool->p.napi); return napi && READ_ONCE(napi->list_owner) == cpuid; } void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { if (!allow_direct) allow_direct = page_pool_napi_local(pool); netmem = __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct); if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); page_pool_return_page(pool, netmem); } } EXPORT_SYMBOL(page_pool_put_unrefed_netmem); void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } EXPORT_SYMBOL(page_pool_put_unrefed_page); static void page_pool_recycle_ring_bulk(struct page_pool *pool, netmem_ref *bulk, u32 bulk_len) { bool in_softirq; u32 i; /* Bulk produce into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); for (i = 0; i < bulk_len; i++) { if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } page_pool_producer_unlock(pool, in_softirq); recycle_stat_add(pool, ring, i); /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; /* * ptr_ring cache is full, free remaining pages outside producer lock * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) page_pool_return_page(pool, bulk[i]); } /** * page_pool_put_netmem_bulk() - release references on multiple netmems * @data: array holding netmem references * @count: number of entries in @data * * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() * will release leftover netmems to the memory provider. * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx * completion loop for the XDP_REDIRECT use case. * * Please note the caller must not use data area after running * page_pool_put_netmem_bulk(), as this function overwrites it. */ void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { u32 bulk_len = 0; for (u32 i = 0; i < count; i++) { netmem_ref netmem = netmem_compound_head(data[i]); if (page_pool_unref_and_test(netmem)) data[bulk_len++] = netmem; } count = bulk_len; while (count) { netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; struct page_pool *pool = NULL; bool allow_direct; u32 foreign = 0; bulk_len = 0; for (u32 i = 0; i < count; i++) { struct page_pool *netmem_pp; netmem_ref netmem = data[i]; netmem_pp = netmem_get_pp(netmem); if (unlikely(!pool)) { pool = netmem_pp; allow_direct = page_pool_napi_local(pool); } else if (netmem_pp != pool) { /* * If the netmem belongs to a different * page_pool, save it for another round. */ data[foreign++] = netmem; continue; } netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (netmem) bulk[bulk_len++] = netmem; } if (bulk_len) page_pool_recycle_ring_bulk(pool, bulk, bulk_len); count = foreign; } } EXPORT_SYMBOL(page_pool_put_netmem_bulk); static netmem_ref page_pool_drain_frag(struct page_pool *pool, netmem_ref netmem) { long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ if (likely(page_pool_unref_netmem(netmem, drain_count))) return 0; if (__page_pool_page_can_be_recycled(netmem)) { page_pool_dma_sync_for_device(pool, netmem, -1); return netmem; } page_pool_return_page(pool, netmem); return 0; } static void page_pool_free_frag(struct page_pool *pool) { long drain_count = BIAS_MAX - pool->frag_users; netmem_ref netmem = pool->frag_page; pool->frag_page = 0; if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; page_pool_return_page(pool, netmem); } netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; netmem_ref netmem = pool->frag_page; if (WARN_ON(size > max_size)) return 0; size = ALIGN(size, dma_get_cache_alignment()); *offset = pool->frag_offset; if (netmem && *offset + size > max_size) { netmem = page_pool_drain_frag(pool, netmem); if (netmem) { recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } } if (!netmem) { netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; } pool->frag_page = netmem; frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; page_pool_fragment_netmem(netmem, BIAS_MAX); return netmem; } pool->frag_users++; pool->frag_offset = *offset + size; return netmem; } EXPORT_SYMBOL(page_pool_alloc_frag_netmem); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp) { return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, gfp)); } EXPORT_SYMBOL(page_pool_alloc_frag); static void page_pool_empty_ring(struct page_pool *pool) { netmem_ref netmem; /* Empty recycle ring */ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ if (!(netmem_ref_count(netmem) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", __func__, netmem_ref_count(netmem)); page_pool_return_page(pool, netmem); } } static void __page_pool_destroy(struct page_pool *pool) { if (pool->disconnect) pool->disconnect(pool); page_pool_unlist(pool); page_pool_uninit(pool); if (pool->mp_ops) { pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } kfree(pool); } static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { netmem_ref netmem; if (pool->destroy_cnt) return; /* Empty alloc cache, assume caller made sure this is * no-longer in use, and page_pool_alloc_pages() cannot be * call concurrently. */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; page_pool_return_page(pool, netmem); } } static void page_pool_scrub(struct page_pool *pool) { unsigned long id; void *ptr; page_pool_empty_alloc_cache_once(pool); if (!pool->destroy_cnt++ && pool->dma_map) { if (pool->dma_sync) { /* Disable page_pool_dma_sync_for_device() */ pool->dma_sync = false; /* Make sure all concurrent returns that may see the old * value of dma_sync (and thus perform a sync) have * finished before doing the unmapping below. Skip the * wait if the device doesn't actually need syncing, or * if there are no outstanding mapped pages. */ if (dma_dev_need_sync(pool->p.dev) && !xa_empty(&pool->dma_mapped)) synchronize_net(); } xa_for_each(&pool->dma_mapped, id, ptr) __page_pool_release_page_dma(pool, page_to_netmem(ptr)); } /* No more consumers should exist, but producers could still * be in-flight. */ page_pool_empty_ring(pool); } static int page_pool_release(struct page_pool *pool) { int inflight; page_pool_scrub(pool); inflight = page_pool_inflight(pool, true); if (!inflight) __page_pool_destroy(pool); return inflight; } static void page_pool_release_retry(struct work_struct *wq) { struct delayed_work *dwq = to_delayed_work(wq); struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); void *netdev; int inflight; inflight = page_pool_release(pool); /* In rare cases, a driver bug may cause inflight to go negative. * Don't reschedule release if inflight is 0 or negative. * - If 0, the page_pool has been destroyed * - if negative, we will never recover * in both cases no reschedule is necessary. */ if (inflight <= 0) return; /* Periodic warning for page pools the user can't see */ netdev = READ_ONCE(pool->slow.netdev); if (time_after_eq(jiffies, pool->defer_warn) && (!netdev || netdev == NET_PTR_POISON)) { int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", __func__, pool->user.id, inflight, sec); pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; } /* Still not ready to be disconnected, retry later */ schedule_delayed_work(&pool->release_dw, DEFER_TIME); } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. * Paired with READ_ONCE() in page_pool_napi_local(). */ WRITE_ONCE(pool->cpuid, -1); if (!pool->p.napi) return; napi_assert_will_not_race(pool->p.napi); mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_disable_direct_recycling); void page_pool_destroy(struct page_pool *pool) { if (!pool) return; if (!page_pool_put(pool)) return; page_pool_disable_direct_recycling(pool); page_pool_free_frag(pool); if (!page_pool_release(pool)) return; page_pool_detached(pool); pool->defer_start = jiffies; pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); schedule_delayed_work(&pool->release_dw, DEFER_TIME); } EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { netmem_ref netmem; trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; page_pool_return_page(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid); bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) { return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); } /* Associate a niov with a page pool. Should follow with a matching * net_mp_niov_clear_page_pool() */ void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) { netmem_ref netmem = net_iov_to_netmem(niov); page_pool_set_pp_info(pool, netmem); pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Disassociate a niov from a page pool. Should only be used in the * ->release_netmem() path. */ void net_mp_niov_clear_page_pool(struct net_iov *niov) { netmem_ref netmem = net_iov_to_netmem(niov); page_pool_clear_pp_info(netmem); }
11 11 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 // SPDX-License-Identifier: GPL-2.0 /* * CPU <-> hardware queue mapping helpers * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/group_cpus.h> #include <linux/device/bus.h> #include "blk.h" #include "blk-mq.h" void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { const struct cpumask *masks; unsigned int queue, cpu; masks = group_cpus_evenly(qmap->nr_queues); if (!masks) { for_each_possible_cpu(cpu) qmap->mq_map[cpu] = qmap->queue_offset; return; } for (queue = 0; queue < qmap->nr_queues; queue++) { for_each_cpu(cpu, &masks[queue]) qmap->mq_map[cpu] = qmap->queue_offset + queue; } kfree(masks); } EXPORT_SYMBOL_GPL(blk_mq_map_queues); /** * blk_mq_hw_queue_to_node - Look up the memory node for a hardware queue index * @qmap: CPU to hardware queue map. * @index: hardware queue index. * * We have no quick way of doing reverse lookups. This is only used at * queue init time, so runtime isn't important. */ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) { int i; for_each_possible_cpu(i) { if (index == qmap->mq_map[i]) return cpu_to_node(i); } return NUMA_NO_NODE; } /** * blk_mq_map_hw_queues - Create CPU to hardware queue mapping * @qmap: CPU to hardware queue map * @dev: The device to map queues * @offset: Queue offset to use for the device * * Create a CPU to hardware queue mapping in @qmap. The struct bus_type * irq_get_affinity callback will be used to retrieve the affinity. */ void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, struct device *dev, unsigned int offset) { const struct cpumask *mask; unsigned int queue, cpu; if (!dev->bus->irq_get_affinity) goto fallback; for (queue = 0; queue < qmap->nr_queues; queue++) { mask = dev->bus->irq_get_affinity(dev, queue + offset); if (!mask) goto fallback; for_each_cpu(cpu, mask) qmap->mq_map[cpu] = qmap->queue_offset + queue; } return; fallback: blk_mq_map_queues(qmap); } EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues);
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Weighted Fail Over module * * Authors: Kenny Mathis <kmathis@chokepoint.net> * * Changes: * Kenny Mathis : added initial functionality based on weight */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> /* Weighted Fail Over Module */ static struct ip_vs_dest * ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *hweight = NULL; int hw = 0; /* Track highest weight */ IP_VS_DBG(6, "ip_vs_fo_schedule(): Scheduling...\n"); /* Basic failover functionality * Find virtual server with highest weight and send it traffic */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > hw) { hweight = dest; hw = atomic_read(&dest->weight); } } if (hweight) { IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n", IP_VS_DBG_ADDR(hweight->af, &hweight->addr), ntohs(hweight->port), atomic_read(&hweight->activeconns), atomic_read(&hweight->weight)); return hweight; } ip_vs_scheduler_err(svc, "no destination available"); return NULL; } static struct ip_vs_scheduler ip_vs_fo_scheduler = { .name = "fo", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_fo_scheduler.n_list), .schedule = ip_vs_fo_schedule, }; static int __init ip_vs_fo_init(void) { return register_ip_vs_scheduler(&ip_vs_fo_scheduler); } static void __exit ip_vs_fo_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_fo_scheduler); synchronize_rcu(); } module_init(ip_vs_fo_init); module_exit(ip_vs_fo_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs weighted failover scheduler");
25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_TABLES_IPV4_H_ #define _NF_TABLES_IPV4_H_ #include <net/netfilter/nf_tables.h> #include <net/ip.h> static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) { struct iphdr *ip; ip = ip_hdr(pkt->skb); pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = ip->protocol; pkt->thoff = ip_hdrlen(pkt->skb); pkt->fragoff = ntohs(ip->frag_off) & IP_OFFSET; } static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; u32 len, thoff, skb_len; iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*iph), &_iph); if (!iph) return -1; if (iph->ihl < 5 || iph->version != 4) return -1; len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; skb_len = pkt->skb->len - skb_network_offset(pkt->skb); if (skb_len < len) return -1; else if (len < thoff) return -1; else if (thoff < sizeof(*iph)) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; pkt->thoff = skb_network_offset(pkt->skb) + thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; } static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { if (__nft_set_pktinfo_ipv4_validate(pkt) < 0) nft_set_pktinfo_unspec(pkt); } static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) { struct iphdr *iph; u32 len, thoff; if (!pskb_may_pull(pkt->skb, sizeof(*iph))) return -1; iph = ip_hdr(pkt->skb); if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; if (pkt->skb->len < len) { __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } else if (len < thoff) { goto inhdr_error; } else if (thoff < sizeof(*iph)) { return -1; } pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; pkt->thoff = thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; inhdr_error: __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INHDRERRORS); return -1; } #endif
64 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM icmp #if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ICMP_H #include <linux/icmp.h> #include <linux/tracepoint.h> TRACE_EVENT(icmp_send, TP_PROTO(const struct sk_buff *skb, int type, int code), TP_ARGS(skb, type, code), TP_STRUCT__entry( __field(const void *, skbaddr) __field(int, type) __field(int, code) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __field(__u16, sport) __field(__u16, dport) __field(unsigned short, ulen) ), TP_fast_assign( struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = udp_hdr(skb); int proto_4 = iph->protocol; __be32 *p32; __entry->skbaddr = skb; __entry->type = type; __entry->code = code; if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head || (u8 *)uh + sizeof(struct udphdr) > skb_tail_pointer(skb)) { __entry->sport = 0; __entry->dport = 0; __entry->ulen = 0; } else { __entry->sport = ntohs(uh->source); __entry->dport = ntohs(uh->dest); __entry->ulen = ntohs(uh->len); } p32 = (__be32 *) __entry->saddr; *p32 = iph->saddr; p32 = (__be32 *) __entry->daddr; *p32 = iph->daddr; ), TP_printk("icmp_send: type=%d, code=%d. From %pI4:%u to %pI4:%u ulen=%d skbaddr=%p", __entry->type, __entry->code, __entry->saddr, __entry->sport, __entry->daddr, __entry->dport, __entry->ulen, __entry->skbaddr) ); #endif /* _TRACE_ICMP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
15 15 15 15 15 15 665 666 28 607 607 63 63 1666 1665 1664 1146 1149 3 2 2 1 1 28 28 28 28 15 14 15 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 10971 10972 10973 10974 10975 10976 10977 10978 10979 10980 10981 10982 10983 10984 10985 10986 10987 10988 10989 10990 10991 10992 10993 10994 10995 10996 10997 10998 10999 11000 11001 11002 11003 11004 11005 11006 11007 11008 11009 11010 11011 11012 11013 11014 11015 11016 11017 11018 11019 11020 11021 11022 11023 11024 11025 11026 11027 11028 11029 11030 11031 11032 11033 11034 11035 11036 11037 11038 11039 11040 11041 11042 11043 11044 11045 11046 11047 11048 11049 11050 11051 11052 11053 11054 11055 11056 11057 11058 11059 11060 11061 11062 11063 11064 11065 11066 11067 11068 11069 11070 11071 11072 11073 11074 11075 11076 11077 11078 11079 11080 11081 11082 11083 11084 11085 11086 11087 11088 11089 11090 11091 11092 11093 11094 11095 11096 11097 11098 11099 11100 11101 11102 11103 11104 11105 11106 11107 11108 11109 11110 11111 11112 11113 11114 11115 11116 11117 11118 11119 11120 11121 11122 11123 11124 11125 11126 11127 11128 11129 11130 11131 11132 11133 11134 11135 11136 11137 11138 11139 11140 11141 11142 11143 11144 11145 11146 11147 11148 11149 11150 11151 11152 11153 11154 11155 11156 11157 11158 11159 11160 11161 11162 11163 11164 11165 11166 11167 11168 11169 11170 11171 11172 11173 11174 11175 11176 11177 11178 11179 11180 11181 11182 11183 11184 11185 11186 11187 11188 11189 11190 11191 11192 11193 11194 11195 11196 11197 11198 11199 11200 11201 11202 11203 11204 11205 11206 11207 11208 11209 11210 11211 11212 11213 11214 11215 11216 11217 11218 11219 11220 11221 11222 11223 11224 11225 11226 11227 11228 11229 11230 11231 11232 11233 11234 11235 11236 11237 11238 11239 11240 11241 11242 11243 11244 11245 11246 11247 11248 11249 11250 11251 11252 11253 11254 11255 11256 11257 11258 11259 11260 11261 11262 11263 11264 11265 11266 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277 11278 11279 11280 11281 11282 11283 11284 11285 11286 11287 11288 11289 11290 11291 11292 11293 11294 11295 11296 11297 11298 11299 11300 11301 11302 11303 11304 11305 11306 11307 11308 11309 11310 11311 11312 11313 11314 11315 11316 11317 11318 11319 11320 11321 11322 11323 11324 11325 11326 11327 11328 11329 11330 11331 11332 11333 11334 11335 11336 11337 11338 11339 11340 11341 11342 11343 11344 11345 11346 11347 11348 11349 11350 11351 11352 11353 11354 11355 11356 11357 11358 11359 11360 11361 11362 11363 11364 11365 11366 11367 11368 11369 11370 11371 11372 11373 11374 11375 11376 11377 11378 11379 11380 11381 11382 11383 11384 11385 11386 11387 11388 11389 11390 11391 11392 11393 11394 11395 11396 11397 11398 11399 11400 11401 11402 11403 11404 11405 11406 11407 11408 11409 11410 11411 11412 11413 11414 11415 11416 11417 11418 11419 11420 11421 11422 11423 11424 11425 11426 11427 11428 11429 11430 11431 11432 11433 11434 11435 11436 11437 11438 11439 11440 11441 11442 11443 11444 11445 11446 11447 11448 11449 11450 11451 11452 11453 11454 11455 11456 11457 11458 11459 11460 11461 11462 11463 11464 11465 11466 11467 11468 11469 11470 11471 11472 11473 11474 11475 11476 11477 11478 11479 11480 11481 11482 11483 11484 11485 11486 11487 11488 11489 11490 11491 11492 11493 11494 11495 11496 11497 11498 11499 11500 11501 11502 11503 11504 11505 11506 11507 11508 11509 11510 11511 11512 11513 11514 11515 11516 11517 11518 11519 11520 11521 11522 11523 11524 11525 11526 11527 11528 11529 11530 11531 11532 11533 11534 11535 11536 11537 11538 11539 11540 11541 11542 11543 11544 11545 11546 11547 11548 11549 11550 11551 11552 11553 11554 11555 11556 11557 11558 11559 11560 11561 11562 11563 11564 11565 11566 11567 11568 11569 11570 11571 11572 11573 11574 11575 11576 11577 11578 11579 11580 11581 11582 11583 11584 11585 11586 11587 11588 11589 11590 11591 11592 11593 11594 11595 11596 11597 11598 11599 11600 11601 11602 11603 11604 11605 11606 11607 11608 11609 11610 11611 11612 11613 11614 11615 11616 11617 11618 11619 11620 11621 11622 11623 11624 11625 11626 11627 11628 11629 11630 11631 11632 11633 11634 11635 11636 11637 11638 11639 11640 11641 11642 11643 11644 11645 11646 11647 11648 11649 11650 11651 11652 11653 11654 11655 11656 11657 11658 11659 11660 11661 11662 11663 11664 11665 11666 11667 11668 11669 11670 11671 11672 11673 11674 11675 11676 11677 11678 11679 11680 11681 11682 11683 11684 11685 11686 11687 11688 11689 11690 11691 11692 11693 11694 11695 11696 11697 11698 11699 11700 11701 11702 11703 11704 11705 11706 11707 11708 11709 11710 11711 11712 11713 11714 11715 11716 11717 11718 11719 11720 11721 11722 11723 11724 11725 11726 11727 11728 11729 11730 11731 11732 11733 11734 11735 11736 11737 11738 11739 11740 11741 11742 11743 11744 11745 11746 11747 11748 11749 11750 11751 11752 11753 11754 11755 11756 11757 11758 11759 11760 11761 11762 11763 11764 11765 11766 11767 11768 11769 11770 11771 11772 11773 11774 11775 11776 11777 11778 11779 11780 11781 11782 11783 11784 11785 11786 11787 11788 11789 11790 11791 11792 11793 11794 11795 11796 11797 11798 11799 11800 11801 11802 11803 11804 11805 11806 11807 11808 11809 11810 11811 11812 11813 11814 11815 11816 11817 11818 11819 11820 11821 11822 11823 11824 11825 11826 11827 11828 11829 11830 11831 11832 11833 11834 11835 11836 11837 11838 11839 11840 11841 11842 11843 11844 11845 11846 11847 11848 11849 11850 11851 11852 11853 11854 11855 11856 11857 11858 11859 11860 11861 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 11872 11873 11874 11875 11876 11877 11878 11879 11880 11881 11882 11883 11884 11885 11886 11887 11888 11889 11890 11891 11892 11893 11894 11895 11896 11897 11898 11899 11900 11901 11902 11903 11904 11905 11906 11907 11908 11909 11910 11911 11912 11913 11914 11915 11916 11917 11918 11919 11920 11921 11922 11923 11924 11925 11926 11927 11928 11929 11930 11931 11932 11933 11934 11935 11936 11937 11938 11939 11940 11941 11942 11943 11944 11945 11946 11947 11948 11949 11950 11951 11952 11953 11954 11955 11956 11957 11958 11959 11960 11961 11962 11963 11964 11965 11966 11967 11968 11969 11970 11971 11972 11973 11974 11975 11976 11977 11978 11979 11980 11981 11982 11983 11984 11985 11986 11987 11988 11989 11990 11991 11992 11993 11994 11995 11996 11997 11998 11999 12000 12001 12002 12003 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015 12016 12017 12018 12019 12020 12021 12022 12023 12024 12025 12026 12027 12028 12029 12030 12031 12032 12033 12034 12035 12036 12037 12038 12039 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050 12051 12052 12053 12054 12055 12056 12057 12058 12059 12060 12061 12062 12063 12064 12065 12066 12067 12068 12069 12070 12071 12072 12073 12074 12075 12076 12077 12078 12079 12080 12081 12082 12083 12084 12085 12086 12087 12088 12089 12090 12091 12092 12093 12094 12095 12096 12097 12098 12099 12100 12101 12102 12103 12104 12105 12106 12107 12108 12109 12110 12111 12112 12113 12114 12115 12116 12117 12118 12119 12120 12121 12122 12123 12124 12125 12126 12127 12128 12129 12130 12131 12132 12133 12134 12135 12136 12137 12138 12139 12140 12141 12142 12143 12144 12145 12146 12147 12148 12149 12150 12151 12152 12153 12154 12155 12156 12157 12158 12159 12160 12161 12162 12163 12164 12165 12166 12167 12168 12169 12170 12171 12172 12173 12174 12175 12176 12177 12178 12179 12180 12181 12182 12183 12184 12185 12186 12187 12188 12189 12190 12191 12192 12193 12194 12195 12196 12197 12198 12199 12200 12201 12202 12203 12204 12205 12206 12207 12208 12209 12210 12211 12212 12213 12214 12215 12216 12217 12218 12219 12220 12221 12222 12223 12224 12225 12226 12227 12228 12229 12230 12231 12232 12233 12234 12235 12236 12237 12238 12239 12240 12241 12242 12243 12244 12245 12246 12247 12248 12249 12250 12251 12252 12253 12254 12255 12256 12257 12258 12259 12260 12261 12262 12263 12264 12265 12266 12267 12268 12269 12270 12271 12272 12273 12274 12275 12276 12277 12278 12279 12280 12281 12282 12283 12284 12285 12286 12287 12288 12289 12290 12291 12292 12293 12294 12295 12296 12297 12298 12299 12300 12301 12302 12303 12304 12305 12306 12307 12308 12309 12310 12311 12312 12313 12314 12315 12316 12317 12318 12319 12320 12321 12322 12323 12324 12325 12326 12327 12328 12329 12330 12331 12332 12333 12334 12335 12336 12337 12338 12339 12340 12341 12342 12343 12344 12345 12346 12347 12348 12349 12350 12351 12352 12353 12354 12355 12356 12357 12358 12359 12360 12361 12362 12363 12364 12365 12366 12367 12368 12369 12370 12371 12372 12373 12374 12375 12376 12377 12378 12379 12380 12381 12382 12383 12384 12385 12386 12387 12388 12389 12390 12391 12392 12393 12394 12395 12396 12397 12398 12399 12400 12401 12402 12403 12404 12405 12406 12407 12408 12409 12410 12411 12412 12413 12414 12415 12416 12417 12418 12419 12420 12421 12422 12423 12424 12425 12426 12427 12428 12429 12430 12431 12432 12433 12434 12435 12436 12437 12438 12439 12440 12441 12442 12443 12444 12445 12446 12447 12448 12449 12450 12451 12452 12453 12454 12455 12456 12457 12458 12459 12460 12461 12462 12463 12464 12465 12466 12467 12468 12469 12470 12471 12472 12473 12474 12475 12476 12477 12478 12479 12480 12481 12482 12483 12484 12485 12486 12487 12488 12489 12490 12491 12492 12493 12494 12495 12496 12497 12498 12499 12500 12501 12502 12503 12504 12505 12506 12507 12508 12509 12510 12511 12512 12513 12514 12515 12516 12517 12518 12519 12520 12521 12522 12523 12524 12525 12526 12527 12528 12529 12530 12531 12532 12533 12534 12535 12536 12537 12538 12539 12540 12541 12542 12543 12544 12545 12546 12547 12548 12549 12550 12551 12552 12553 12554 12555 12556 12557 12558 12559 12560 12561 12562 12563 12564 12565 12566 12567 12568 12569 12570 12571 12572 12573 12574 12575 12576 12577 12578 12579 12580 12581 12582 12583 12584 12585 12586 12587 12588 12589 12590 12591 12592 12593 12594 12595 12596 12597 12598 12599 12600 12601 12602 12603 12604 12605 12606 12607 12608 12609 12610 12611 12612 12613 12614 12615 12616 12617 12618 12619 12620 12621 12622 12623 12624 12625 12626 12627 12628 12629 12630 12631 12632 12633 12634 12635 12636 12637 12638 12639 12640 12641 12642 12643 12644 12645 12646 12647 12648 12649 12650 12651 12652 12653 12654 12655 12656 12657 12658 12659 12660 12661 12662 12663 12664 12665 12666 12667 12668 12669 12670 12671 12672 12673 12674 12675 12676 12677 12678 12679 12680 12681 12682 12683 12684 12685 12686 12687 12688 12689 12690 12691 12692 12693 12694 12695 12696 12697 12698 12699 12700 12701 12702 12703 12704 12705 12706 12707 12708 12709 12710 12711 12712 12713 12714 12715 12716 12717 12718 12719 12720 12721 12722 12723 12724 12725 12726 12727 12728 12729 12730 12731 12732 12733 12734 12735 12736 12737 12738 12739 12740 12741 12742 12743 12744 12745 12746 12747 12748 12749 12750 12751 12752 12753 12754 12755 12756 12757 12758 12759 12760 12761 12762 12763 12764 12765 12766 12767 12768 12769 12770 12771 12772 12773 12774 12775 12776 12777 12778 12779 12780 12781 12782 12783 12784 12785 12786 12787 12788 12789 12790 12791 12792 12793 12794 12795 12796 12797 12798 12799 12800 12801 12802 12803 12804 12805 12806 12807 12808 12809 12810 12811 12812 12813 12814 12815 12816 12817 12818 12819 12820 12821 12822 12823 12824 12825 12826 12827 12828 12829 12830 12831 12832 12833 12834 12835 12836 12837 12838 12839 12840 12841 12842 12843 12844 12845 12846 12847 12848 12849 12850 12851 12852 12853 12854 12855 12856 12857 12858 12859 12860 12861 12862 12863 12864 12865 12866 12867 12868 12869 12870 12871 12872 12873 12874 12875 12876 12877 12878 12879 12880 12881 12882 12883 12884 12885 12886 12887 12888 12889 12890 12891 12892 12893 12894 12895 12896 12897 12898 12899 12900 12901 12902 12903 12904 12905 12906 12907 12908 12909 12910 12911 12912 12913 12914 12915 12916 12917 12918 12919 12920 12921 12922 12923 12924 12925 12926 12927 12928 12929 12930 12931 12932 12933 12934 12935 12936 12937 12938 12939 12940 12941 12942 12943 12944 12945 12946 12947 12948 12949 12950 12951 12952 12953 12954 12955 12956 12957 12958 12959 12960 12961 12962 12963 12964 12965 12966 12967 12968 12969 12970 12971 12972 12973 12974 12975 12976 12977 12978 12979 12980 12981 12982 12983 12984 12985 12986 12987 12988 12989 12990 12991 12992 12993 12994 12995 12996 12997 12998 12999 13000 13001 13002 13003 13004 13005 13006 13007 13008 13009 13010 13011 13012 13013 13014 13015 13016 13017 13018 13019 13020 13021 13022 13023 13024 13025 13026 13027 13028 13029 13030 13031 13032 13033 13034 13035 13036 13037 13038 13039 13040 13041 13042 13043 13044 13045 13046 13047 13048 13049 13050 13051 13052 13053 13054 13055 13056 13057 13058 13059 13060 13061 13062 13063 13064 13065 13066 13067 13068 13069 13070 13071 13072 13073 13074 13075 13076 13077 13078 13079 13080 13081 13082 13083 13084 13085 13086 13087 13088 13089 13090 13091 13092 13093 13094 13095 13096 13097 13098 13099 13100 13101 13102 13103 13104 13105 13106 13107 13108 13109 13110 13111 13112 13113 13114 13115 13116 13117 13118 13119 13120 13121 13122 13123 13124 13125 13126 13127 13128 13129 13130 13131 13132 13133 13134 13135 13136 13137 13138 13139 13140 13141 13142 13143 13144 13145 13146 13147 13148 13149 13150 13151 13152 13153 13154 13155 13156 13157 13158 13159 13160 13161 13162 13163 13164 13165 13166 13167 13168 13169 13170 13171 13172 13173 13174 13175 13176 13177 13178 13179 13180 13181 13182 13183 13184 13185 13186 13187 13188 13189 13190 13191 13192 13193 13194 13195 13196 13197 13198 13199 13200 13201 13202 13203 13204 13205 13206 13207 13208 13209 13210 13211 13212 13213 13214 13215 13216 13217 13218 13219 13220 13221 13222 13223 13224 13225 13226 13227 13228 13229 13230 13231 13232 13233 13234 13235 13236 13237 13238 13239 13240 13241 13242 13243 13244 13245 13246 13247 13248 13249 13250 13251 13252 13253 13254 13255 13256 13257 13258 13259 13260 13261 13262 13263 13264 13265 13266 13267 13268 13269 13270 13271 13272 13273 13274 13275 13276 13277 13278 13279 13280 13281 13282 13283 13284 13285 13286 13287 13288 13289 13290 13291 13292 13293 13294 13295 13296 13297 13298 13299 13300 13301 13302 13303 13304 13305 13306 13307 13308 13309 13310 13311 13312 13313 13314 13315 13316 13317 13318 13319 13320 13321 13322 13323 13324 13325 13326 13327 13328 13329 13330 13331 13332 13333 13334 13335 13336 13337 13338 13339 13340 13341 13342 13343 13344 13345 13346 13347 13348 13349 13350 13351 13352 13353 13354 13355 13356 13357 13358 13359 13360 13361 13362 13363 13364 13365 13366 13367 13368 13369 13370 13371 13372 13373 13374 13375 13376 13377 13378 13379 13380 13381 13382 13383 13384 13385 13386 13387 13388 13389 13390 13391 13392 13393 13394 13395 13396 13397 13398 13399 13400 13401 13402 13403 13404 13405 13406 13407 13408 13409 13410 13411 13412 13413 13414 13415 13416 13417 13418 13419 13420 13421 13422 13423 13424 13425 13426 13427 13428 13429 13430 13431 13432 13433 13434 13435 13436 13437 13438 13439 13440 13441 13442 13443 13444 13445 13446 13447 13448 13449 13450 13451 13452 13453 13454 13455 13456 13457 13458 13459 13460 13461 13462 13463 13464 13465 13466 13467 13468 13469 13470 13471 13472 13473 13474 13475 13476 13477 13478 13479 13480 13481 13482 13483 13484 13485 13486 13487 13488 13489 13490 13491 13492 13493 13494 13495 13496 13497 13498 13499 13500 13501 13502 13503 13504 13505 13506 13507 13508 13509 13510 13511 13512 13513 13514 13515 13516 13517 13518 13519 13520 13521 13522 13523 13524 13525 13526 13527 13528 13529 13530 13531 13532 13533 13534 13535 13536 13537 13538 13539 13540 13541 13542 13543 13544 13545 13546 13547 13548 13549 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559 13560 13561 13562 13563 13564 13565 13566 13567 13568 13569 13570 13571 13572 13573 13574 13575 13576 13577 13578 13579 13580 13581 13582 13583 13584 13585 13586 13587 13588 13589 13590 13591 13592 13593 13594 13595 13596 13597 13598 13599 13600 13601 13602 13603 13604 13605 13606 13607 13608 13609 13610 13611 13612 13613 13614 13615 13616 13617 13618 13619 13620 13621 13622 13623 13624 13625 13626 13627 13628 13629 13630 13631 13632 13633 13634 13635 13636 13637 13638 13639 13640 13641 13642 13643 13644 13645 13646 13647 13648 13649 13650 13651 13652 13653 13654 13655 13656 13657 13658 13659 13660 13661 13662 13663 13664 13665 13666 13667 13668 13669 13670 13671 13672 13673 13674 13675 13676 13677 13678 13679 13680 13681 13682 13683 13684 13685 13686 13687 13688 13689 13690 13691 13692 13693 13694 13695 13696 13697 13698 13699 13700 13701 13702 13703 13704 13705 13706 13707 13708 13709 13710 13711 13712 13713 13714 13715 13716 13717 13718 13719 13720 13721 13722 13723 13724 13725 13726 13727 13728 13729 13730 13731 13732 13733 13734 13735 13736 13737 13738 13739 13740 13741 13742 13743 13744 13745 13746 13747 13748 13749 13750 13751 13752 13753 13754 13755 13756 13757 13758 13759 13760 13761 13762 13763 13764 13765 13766 13767 13768 13769 13770 13771 13772 13773 13774 13775 13776 13777 13778 13779 13780 13781 13782 13783 13784 13785 13786 13787 13788 13789 13790 13791 13792 13793 13794 13795 13796 13797 13798 13799 13800 13801 13802 13803 13804 13805 13806 13807 13808 13809 13810 13811 13812 13813 13814 13815 13816 13817 13818 13819 13820 13821 13822 13823 13824 13825 13826 13827 13828 13829 13830 13831 13832 13833 13834 13835 13836 13837 13838 13839 13840 13841 13842 13843 13844 13845 13846 13847 13848 13849 13850 13851 13852 13853 13854 13855 13856 13857 13858 13859 13860 13861 13862 13863 13864 13865 13866 13867 13868 13869 13870 13871 13872 13873 13874 13875 13876 13877 13878 13879 13880 13881 13882 13883 13884 13885 13886 13887 13888 13889 13890 13891 13892 13893 13894 13895 13896 13897 13898 13899 13900 13901 13902 13903 13904 13905 13906 13907 13908 13909 13910 13911 13912 13913 13914 13915 13916 13917 13918 13919 13920 13921 13922 13923 13924 13925 13926 13927 13928 13929 13930 13931 13932 13933 13934 13935 13936 13937 13938 13939 13940 13941 13942 13943 13944 13945 13946 13947 13948 13949 13950 13951 13952 13953 13954 13955 13956 13957 13958 13959 13960 13961 13962 13963 13964 13965 13966 13967 13968 13969 13970 13971 13972 13973 13974 13975 13976 13977 13978 13979 13980 13981 13982 13983 13984 13985 13986 13987 13988 13989 13990 13991 13992 13993 13994 13995 13996 13997 13998 13999 14000 14001 14002 14003 14004 14005 14006 14007 14008 14009 14010 14011 14012 14013 14014 14015 14016 14017 14018 14019 14020 14021 14022 14023 14024 14025 14026 14027 14028 14029 14030 14031 14032 14033 14034 14035 14036 14037 14038 14039 14040 14041 14042 14043 14044 14045 14046 14047 14048 14049 14050 14051 14052 14053 14054 14055 14056 14057 14058 14059 14060 14061 14062 14063 14064 14065 14066 14067 14068 14069 14070 14071 14072 14073 14074 14075 14076 14077 14078 14079 14080 14081 14082 14083 14084 14085 14086 14087 14088 14089 14090 14091 14092 14093 14094 14095 14096 14097 14098 14099 14100 14101 14102 14103 14104 14105 14106 14107 14108 14109 14110 14111 14112 14113 14114 14115 14116 14117 14118 14119 14120 14121 14122 14123 14124 14125 14126 14127 14128 14129 14130 14131 14132 14133 14134 14135 14136 14137 14138 14139 14140 14141 14142 14143 14144 14145 14146 14147 14148 14149 14150 14151 14152 14153 14154 14155 14156 14157 14158 14159 14160 14161 14162 14163 14164 14165 14166 14167 14168 14169 14170 14171 14172 14173 14174 14175 14176 14177 14178 14179 14180 14181 14182 14183 14184 14185 14186 14187 14188 14189 14190 14191 14192 14193 14194 14195 14196 14197 14198 14199 14200 14201 14202 14203 14204 14205 14206 14207 14208 14209 14210 14211 14212 14213 14214 14215 14216 14217 14218 14219 14220 14221 14222 14223 14224 14225 14226 14227 14228 14229 14230 14231 14232 14233 14234 14235 14236 14237 14238 14239 14240 14241 14242 14243 14244 14245 14246 14247 14248 14249 14250 14251 14252 14253 14254 14255 14256 14257 14258 14259 14260 14261 14262 14263 14264 14265 14266 14267 14268 14269 14270 14271 14272 14273 14274 14275 14276 14277 14278 14279 14280 14281 14282 14283 14284 14285 14286 14287 14288 14289 14290 14291 14292 14293 14294 14295 14296 14297 14298 14299 14300 14301 14302 14303 14304 14305 14306 14307 14308 14309 14310 14311 14312 14313 14314 14315 14316 14317 14318 14319 14320 14321 14322 14323 14324 14325 14326 14327 14328 14329 14330 14331 14332 14333 14334 14335 14336 14337 14338 14339 14340 14341 14342 14343 14344 14345 14346 14347 14348 14349 14350 14351 14352 14353 14354 14355 14356 14357 14358 14359 14360 14361 14362 14363 14364 14365 14366 14367 14368 14369 14370 14371 14372 14373 14374 14375 14376 14377 14378 14379 14380 14381 14382 14383 14384 14385 14386 14387 14388 14389 14390 14391 14392 14393 14394 14395 14396 14397 14398 14399 14400 14401 14402 14403 14404 14405 14406 14407 14408 14409 14410 14411 14412 14413 14414 14415 14416 14417 14418 14419 14420 14421 14422 14423 14424 14425 14426 14427 14428 14429 14430 14431 14432 14433 14434 14435 14436 14437 14438 14439 14440 14441 14442 14443 14444 14445 14446 14447 14448 14449 14450 14451 14452 14453 14454 14455 14456 14457 14458 14459 14460 14461 14462 14463 14464 14465 14466 14467 14468 14469 14470 14471 14472 14473 14474 14475 14476 14477 14478 14479 14480 14481 14482 14483 14484 14485 14486 14487 14488 14489 14490 14491 14492 14493 14494 14495 14496 14497 14498 14499 14500 14501 14502 14503 14504 14505 14506 14507 14508 14509 14510 14511 14512 14513 14514 14515 14516 14517 14518 14519 14520 14521 14522 14523 14524 14525 14526 14527 14528 14529 14530 14531 14532 14533 14534 14535 14536 14537 14538 14539 14540 14541 14542 14543 14544 14545 14546 14547 14548 14549 14550 14551 14552 14553 14554 14555 14556 14557 14558 14559 14560 14561 14562 14563 14564 14565 14566 14567 14568 14569 14570 14571 14572 14573 14574 14575 14576 14577 14578 14579 14580 14581 14582 14583 14584 14585 14586 14587 14588 14589 14590 14591 14592 14593 14594 14595 14596 14597 14598 14599 14600 14601 14602 14603 14604 14605 14606 14607 14608 14609 14610 14611 14612 14613 14614 14615 14616 14617 14618 // SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ #include <linux/fs.h> #include <linux/mm.h> #include <linux/cpu.h> #include <linux/smp.h> #include <linux/idr.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/tick.h> #include <linux/sysfs.h> #include <linux/dcache.h> #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/reboot.h> #include <linux/vmstat.h> #include <linux/device.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/hugetlb.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> #include <linux/cgroup.h> #include <linux/perf_event.h> #include <linux/trace_events.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> #include <linux/module.h> #include <linux/mman.h> #include <linux/compat.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/namei.h> #include <linux/parser.h> #include <linux/sched/clock.h> #include <linux/sched/mm.h> #include <linux/proc_ns.h> #include <linux/mount.h> #include <linux/min_heap.h> #include <linux/highmem.h> #include <linux/pgtable.h> #include <linux/buildid.h> #include <linux/task_work.h> #include <linux/percpu-rwsem.h> #include "internal.h" #include <asm/irq_regs.h> typedef int (*remote_function_f)(void *); struct remote_function_call { struct task_struct *p; remote_function_f func; void *info; int ret; }; static void remote_function(void *data) { struct remote_function_call *tfc = data; struct task_struct *p = tfc->p; if (p) { /* -EAGAIN */ if (task_cpu(p) != smp_processor_id()) return; /* * Now that we're on right CPU with IRQs disabled, we can test * if we hit the right task without races. */ tfc->ret = -ESRCH; /* No such (running) process */ if (p != current) return; } tfc->ret = tfc->func(tfc->info); } /** * task_function_call - call a function on the cpu on which a task runs * @p: the task to evaluate * @func: the function to be called * @info: the function call argument * * Calls the function @func when the task is currently running. This might * be on the current CPU, which just calls the function directly. This will * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, .func = func, .info = info, .ret = -EAGAIN, }; int ret; for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); if (!ret) ret = data.ret; if (ret != -EAGAIN) break; cond_resched(); } return ret; } /** * cpu_function_call - call a function on the cpu * @cpu: target cpu to queue this function * @func: the function to be called * @info: the function call argument * * Calls the function @func on the remote cpu. * * returns: @func return value or -ENXIO when the cpu is offline */ static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, .func = func, .info = info, .ret = -ENXIO, /* No such CPU */ }; smp_call_function_single(cpu, remote_function, &data, 1); return data.ret; } enum event_type_t { EVENT_FLEXIBLE = 0x01, EVENT_PINNED = 0x02, EVENT_TIME = 0x04, EVENT_FROZEN = 0x08, /* see ctx_resched() for details */ EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, }; static inline void __perf_ctx_lock(struct perf_event_context *ctx) { raw_spin_lock(&ctx->lock); WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN); } static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { __perf_ctx_lock(&cpuctx->ctx); if (ctx) __perf_ctx_lock(ctx); } static inline void __perf_ctx_unlock(struct perf_event_context *ctx) { /* * If ctx_sched_in() didn't again set any ALL flags, clean up * after ctx_sched_out() by clearing is_active. */ if (ctx->is_active & EVENT_FROZEN) { if (!(ctx->is_active & EVENT_ALL)) ctx->is_active = 0; else ctx->is_active &= ~EVENT_FROZEN; } raw_spin_unlock(&ctx->lock); } static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { if (ctx) __perf_ctx_unlock(ctx); __perf_ctx_unlock(&cpuctx->ctx); } #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { return READ_ONCE(event->owner) == TASK_TOMBSTONE; } static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); struct perf_event_context *perf_cpu_task_ctx(void) { lockdep_assert_irqs_disabled(); return this_cpu_ptr(&perf_cpu_context)->task_ctx; } /* * On task ctx scheduling... * * When !ctx->nr_events a task context will not be scheduled. This means * we can disable the scheduler hooks (for performance) without leaving * pending task ctx state. * * This however results in two special cases: * * - removing the last event from a task ctx; this is relatively straight * forward and is done in __perf_remove_from_context. * * - adding the first event to a task ctx; this is tricky because we cannot * rely on ctx->is_active and therefore cannot use event_function_call(). * See perf_install_in_context(). * * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. */ typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, struct perf_event_context *, void *); struct event_function_struct { struct perf_event *event; event_f func; void *data; }; static int event_function(void *info) { struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; lockdep_assert_irqs_disabled(); perf_ctx_lock(cpuctx, task_ctx); /* * Since we do the IPI call without holding ctx->lock things can have * changed, double check we hit the task we set out to hit. */ if (ctx->task) { if (ctx->task != current) { ret = -ESRCH; goto unlock; } /* * We only use event_function_call() on established contexts, * and event_function() is only ever called when active (or * rather, we'll have bailed in task_function_call() or the * above ctx->task != current test), therefore we must have * ctx->is_active here. */ WARN_ON_ONCE(!ctx->is_active); /* * And since we have ctx->is_active, cpuctx->task_ctx must * match. */ WARN_ON_ONCE(task_ctx != ctx); } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } efs->func(event, cpuctx, ctx, efs->data); unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ struct perf_cpu_context *cpuctx; struct event_function_struct efs = { .event = event, .func = func, .data = data, }; if (!event->parent) { /* * If this is a !child event, we must hold ctx::mutex to * stabilize the event->ctx relation. See * perf_event_ctx_lock(). */ lockdep_assert_held(&ctx->mutex); } if (!task) { cpu_function_call(event->cpu, event_function, &efs); return; } if (task == TASK_TOMBSTONE) return; again: if (!task_function_call(task, event_function, &efs)) return; local_irq_disable(); cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); /* * Reload the task pointer, it might have been changed by * a concurrent perf_event_context_sched_out(). */ task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (ctx->is_active) { perf_ctx_unlock(cpuctx, ctx); local_irq_enable(); goto again; } func(event, NULL, ctx, data); unlock: perf_ctx_unlock(cpuctx, ctx); local_irq_enable(); } /* * Similar to event_function_call() + event_function(), but hard assumes IRQs * are already disabled and we're on the right CPU. */ static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; lockdep_assert_irqs_disabled(); if (task) { if (task == TASK_TOMBSTONE) return; task_ctx = ctx; } perf_ctx_lock(cpuctx, task_ctx); task = ctx->task; if (task == TASK_TOMBSTONE) goto unlock; if (task) { /* * We must be either inactive or active and the right task, * otherwise we're screwed, since we cannot IPI to somewhere * else. */ if (ctx->is_active) { if (WARN_ON_ONCE(task != current)) goto unlock; if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) goto unlock; } } else { WARN_ON_ONCE(&cpuctx->ctx != ctx); } func(event, cpuctx, ctx, data); unlock: perf_ctx_unlock(cpuctx, task_ctx); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ PERF_FLAG_FD_CLOEXEC) /* * branch priv levels that need permission checks */ #define PERF_SAMPLE_BRANCH_PERM_PLM \ (PERF_SAMPLE_BRANCH_KERNEL |\ PERF_SAMPLE_BRANCH_HV) /* * perf_sched_events : >0 events exist */ static void perf_sched_delayed(struct work_struct *work); DEFINE_STATIC_KEY_FALSE(perf_sched_events); static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; static cpumask_var_t perf_online_core_mask; static cpumask_var_t perf_online_die_mask; static cpumask_var_t perf_online_cluster_mask; static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: * -1 - not paranoid at all * 0 - disallow raw tracepoint access for unpriv * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ int sysctl_perf_event_paranoid __read_mostly = 2; /* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */ static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* * max perf event sample rate */ #define DEFAULT_MAX_SAMPLE_RATE 100000 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) #define DEFAULT_CPU_TIME_MAX_PERCENT 25 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; static int perf_sample_allowed_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100; static void update_perf_cpu_limits(void) { u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; tmp = div_u64(tmp, 100); if (!tmp) tmp = 1; WRITE_ONCE(perf_sample_allowed_ns, tmp); } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); return 0; } static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; if (sysctl_perf_cpu_time_max_percent == 100 || sysctl_perf_cpu_time_max_percent == 0) { printk(KERN_WARNING "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); WRITE_ONCE(perf_sample_allowed_ns, 0); } else { update_perf_cpu_limits(); } return 0; } static const struct ctl_table events_core_sysctl_table[] = { /* * User-space relies on this file as a feature check for * perf_events being enabled. It's an ABI, do not remove! */ { .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid, .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "perf_event_mlock_kb", .data = &sysctl_perf_event_mlock, .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "perf_event_max_sample_rate", .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = perf_event_max_sample_rate_handler, .extra1 = SYSCTL_ONE, }, { .procname = "perf_cpu_time_max_percent", .data = &sysctl_perf_cpu_time_max_percent, .maxlen = sizeof(sysctl_perf_cpu_time_max_percent), .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE_HUNDRED, }, }; static int __init init_events_core_sysctls(void) { register_sysctl_init("kernel", events_core_sysctl_table); return 0; } core_initcall(init_events_core_sysctls); /* * perf samples are done in some very critical code paths (NMIs). * If they take too much CPU time, the system can lock up and not * get any real work done. This will drop the sample rate when * we detect that events are taking too long. */ #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); static u64 __report_avg; static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { u64 max_len = READ_ONCE(perf_sample_allowed_ns); u64 running_len; u64 avg_len; u32 max; if (max_len == 0) return; /* Decay the counter by 1 average sample. */ running_len = __this_cpu_read(running_sample_length); running_len -= running_len/NR_ACCUMULATED_SAMPLES; running_len += sample_len_ns; __this_cpu_write(running_sample_length, running_len); /* * Note: this will be biased artificially low until we have * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ avg_len = running_len/NR_ACCUMULATED_SAMPLES; if (avg_len <= max_len) return; __report_avg = avg_len; __report_allowed = max_len; /* * Compute a throttle threshold 25% below the current duration. */ avg_len += avg_len / 4; max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; if (avg_len < max) max /= (u32)avg_len; else max = 1; WRITE_ONCE(perf_sample_allowed_ns, avg_len); WRITE_ONCE(max_samples_per_tick, max); sysctl_perf_event_sample_rate = max * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } static atomic64_t perf_event_id; static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } static inline u64 perf_clock(void) { return local_clock(); } static inline u64 perf_event_clock(struct perf_event *event) { return event->clock(); } /* * State based event timekeeping... * * The basic idea is to use event->state to determine which (if any) time * fields to increment with the current delta. This means we only need to * update timestamps when we change state or when they are explicitly requested * (read). * * Event groups make things a little more complicated, but not terribly so. The * rules for a group are that if the group leader is OFF the entire group is * OFF, irrespective of what the group member states are. This results in * __perf_effective_state(). * * A further ramification is that when a group leader flips between OFF and * !OFF, we need to update all group member times. * * * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we * need to make sure the relevant context time is updated before we try and * update our timestamps. */ static __always_inline enum perf_event_state __perf_effective_state(struct perf_event *event) { struct perf_event *leader = event->group_leader; if (leader->state <= PERF_EVENT_STATE_OFF) return leader->state; return event->state; } static __always_inline void __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running) { enum perf_event_state state = __perf_effective_state(event); u64 delta = now - event->tstamp; *enabled = event->total_time_enabled; if (state >= PERF_EVENT_STATE_INACTIVE) *enabled += delta; *running = event->total_time_running; if (state >= PERF_EVENT_STATE_ACTIVE) *running += delta; } static void perf_event_update_time(struct perf_event *event) { u64 now = perf_event_time(event); __perf_update_times(event, now, &event->total_time_enabled, &event->total_time_running); event->tstamp = now; } static void perf_event_update_sibling_time(struct perf_event *leader) { struct perf_event *sibling; for_each_sibling_event(sibling, leader) perf_event_update_time(sibling); } static void perf_event_set_state(struct perf_event *event, enum perf_event_state state) { if (event->state == state) return; perf_event_update_time(event); /* * If a group leader gets enabled/disabled all its siblings * are affected too. */ if ((event->state < 0) ^ (state < 0)) perf_event_update_sibling_time(event); WRITE_ONCE(event->state, state); } /* * UP store-release, load-acquire */ #define __store_release(ptr, val) \ do { \ barrier(); \ WRITE_ONCE(*(ptr), (val)); \ } while (0) #define __load_acquire(ptr) \ ({ \ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ barrier(); \ ___p; \ }) #define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ if (_cgroup && !_epc->nr_cgroups) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_disable(pmu_ctx->pmu); } static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; for_each_epc(pmu_ctx, ctx, NULL, cgroup) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) return true; /* wants specific cgroup scope but @cpuctx isn't associated with any */ if (!cpuctx->cgrp) return false; /* * Cgroup scoping is recursive. An event enabled for a cgroup is * also enabled for all its descendant cgroups. If @cpuctx's * cgroup is a descendant of @event's (the test covers identity * case), it's a match. */ return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, event->cgrp->css.cgroup); } static inline void perf_detach_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); event->cgrp = NULL; } static inline int is_cgroup_event(struct perf_event *event) { return event->cgrp != NULL; } static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); return t->time; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) return t->time; now += READ_ONCE(t->timeoffset); return now; } static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) { if (adv) info->time += now - info->timestamp; info->timestamp = now; /* * see update_context_time() */ WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; struct perf_cgroup_info *info; if (cgrp) { u64 now = perf_clock(); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, now, true); if (final) __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and * when we know the cgroup is pinned (css_get) */ if (!is_cgroup_event(event)) return; info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ if (info->active) __update_cgrp_time(info, perf_clock(), true); } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; /* * ctx->lock held by caller * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ if (!cgrp) return; WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); __update_cgrp_time(info, ctx->timestamp, false); __store_release(&info->active, 1); } } /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; /* * cpuctx->cgrp is set when the first cgroup event enabled, * and is cleared when the last cgroup event disabled. */ if (READ_ONCE(cpuctx->cgrp) == NULL) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in * ctx_sched_out() */ cpuctx->cgrp = cgrp; /* * set cgrp before ctxsw in to allow * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { struct perf_cpu_context *cpuctx; struct perf_event **storage; int cpu, heap_size, ret = 0; /* * Allow storage to have sufficient space for an iterator for each * possibly nested cgroup plus an iterator for events with no cgroup. */ for (heap_size = 1; css; css = css->parent) heap_size++; for_each_possible_cpu(cpu) { cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; storage = kmalloc_node(heap_size * sizeof(struct perf_event *), GFP_KERNEL, cpu_to_node(cpu)); if (!storage) { ret = -ENOMEM; break; } raw_spin_lock_irq(&cpuctx->ctx.lock); if (cpuctx->heap_size < heap_size) { swap(cpuctx->heap, storage); if (storage == cpuctx->heap_default) storage = NULL; cpuctx->heap_size = heap_size; } raw_spin_unlock_irq(&cpuctx->ctx.lock); kfree(storage); } return ret; } static inline int perf_cgroup_connect(int fd, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { struct perf_cgroup *cgrp; struct cgroup_subsys_state *css; CLASS(fd, f)(fd); int ret = 0; if (fd_empty(f)) return -EBADF; css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) return PTR_ERR(css); ret = perf_cgroup_ensure_storage(event, css); if (ret) return ret; cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; /* * all events in a group must monitor * the same cgroup because a task belongs * to only one perf cgroup at a time */ if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; } return ret; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups++; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (ctx->nr_cgroups++) return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; if (!is_cgroup_event(event)) return; event->pmu_ctx->nr_cgroups--; /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); if (--ctx->nr_cgroups) return; cpuctx->cgrp = NULL; } #else /* !CONFIG_CGROUP_PERF */ static inline bool perf_cgroup_match(struct perf_event *event) { return true; } static inline void perf_detach_cgroup(struct perf_event *event) {} static inline int is_cgroup_event(struct perf_event *event) { return 0; } static inline void update_cgrp_time_from_event(struct perf_event *event) { } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { } static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, struct perf_event_attr *attr, struct perf_event *group_leader) { return -EINVAL; } static inline void perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } static inline u64 perf_cgroup_event_time(struct perf_event *event) { return 0; } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { } static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } static void perf_cgroup_switch(struct task_struct *task) { } #endif /* * set default to be dependent on timer tick just * like original code */ #define PERF_CPU_HRTIMER (1000 / HZ) /* * function must be called with interrupts disabled */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); rotations = perf_rotate_context(cpc); raw_spin_lock(&cpc->hrtimer_lock); if (rotations) hrtimer_forward_now(hr, cpc->hrtimer_interval); else cpc->hrtimer_active = 0; raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { struct hrtimer *timer = &cpc->hrtimer; struct pmu *pmu = cpc->epc.pmu; u64 interval; /* * check default is sane, if not set then force to * default interval (1/tick) */ interval = pmu->hrtimer_interval_ms; if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); } static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); if (!cpc->hrtimer_active) { cpc->hrtimer_active = 1; hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } static int perf_mux_hrtimer_restart_ipi(void *arg) { return perf_mux_hrtimer_restart(arg); } static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu) { return *this_cpu_ptr(pmu->cpu_pmu_context); } void perf_pmu_disable(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; if (!(*count)++) pmu->pmu_disable(pmu); } void perf_pmu_enable(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; if (!--(*count)) pmu->pmu_enable(pmu); } static void perf_assert_pmu_disabled(struct pmu *pmu) { int *count = &this_cpc(pmu)->pmu_disable_count; WARN_ON_ONCE(*count == 0); } static inline void perf_pmu_read(struct perf_event *event) { if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); } static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) { struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } static void put_ctx(struct perf_event_context *ctx) { if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } } /* * Because of perf_event::ctx migration in sys_perf_event_open::move_group and * perf_pmu_migrate_context() we need some magic. * * Those places that change perf_event::ctx will hold both * perf_event_ctx::mutex of the 'old' and 'new' ctx value. * * Lock ordering is by mutex address. There are two other sites where * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] * perf_event_exit_event() * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() * inherit_group() * inherit_event() * perf_event_alloc() * perf_init_event() * perf_try_init_event() [ child , 1 ] * * While it appears there is an obvious deadlock here -- the parent and child * nesting levels are inverted between the two. This is in fact safe because * life-time rules separate them. That is an exiting task cannot fork, and a * spawning task cannot (yet) exit. * * But remember that these are parent<->child context relations, and * migration does not affect children, therefore these two orderings should not * interact. * * The change in perf_event::ctx does not affect children (as claimed above) * because the sys_perf_event_open() case will install a new event and break * the ctx parent<->child relation, and perf_pmu_migrate_context() is only * concerned with cpuctx and that doesn't have children. * * The places that change perf_event::ctx will issue: * * perf_remove_from_context(); * synchronize_rcu(); * perf_install_in_context(); * * to affect the change. The remove_from_context() + synchronize_rcu() should * quiesce the event, after which we can install it in the new location. This * means that only external vectors (perf_fops, prctl) can perturb the event * while in transit. Therefore all such accessors should also acquire * perf_event_context::mutex to serialize against this. * * However; because event->ctx can change while we're waiting to acquire * ctx->mutex we must be careful and use the below perf_event_ctx_lock() * function. * * Lock order: * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; * perf_event_context::lock * mmap_lock * perf_event::mmap_mutex * perf_buffer::aux_mutex * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) { struct perf_event_context *ctx; again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); mutex_lock_nested(&ctx->mutex, nesting); if (event->ctx != ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); goto again; } return ctx; } static inline struct perf_event_context * perf_event_ctx_lock(struct perf_event *event) { return perf_event_ctx_lock_nested(event, 0); } static void perf_event_ctx_unlock(struct perf_event *event, struct perf_event_context *ctx) { mutex_unlock(&ctx->mutex); put_ctx(ctx); } /* * This must be done under the ctx->lock, such as to serialize against * context_equiv(), therefore we cannot call put_ctx() since that might end up * calling scheduler related locks and ctx->lock nests inside those. */ static __must_check struct perf_event_context * unclone_ctx(struct perf_event_context *ctx) { struct perf_event_context *parent_ctx = ctx->parent_ctx; lockdep_assert_held(&ctx->lock); if (parent_ctx) ctx->parent_ctx = NULL; ctx->generation++; return parent_ctx; } static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p, enum pid_type type) { u32 nr; /* * only top level events have the pid namespace they were created in */ if (event->parent) event = event->parent; nr = __task_pid_nr_ns(p, type, event->ns); /* avoid -1 if it is idle thread or runs in another ns */ if (!nr && !pid_alive(p)) nr = -1; return nr; } static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_TGID); } static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { return perf_event_pid_type(event, p, PIDTYPE_PID); } /* * If we inherit events we want to return the parent event id * to userspace. */ static u64 primary_event_id(struct perf_event *event) { u64 id = event->id; if (event->parent) id = event->parent->id; return id; } /* * Get the perf_event_context for a task and lock it. * * This has to cope with the fact that until it is locked, * the context could get moved to another task. */ static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read * side critical section has interrupts disabled. */ local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry * if so. If we locked the right context, then it * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); goto retry; } if (ctx->task == TASK_TOMBSTONE || !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); if (!ctx) local_irq_restore(*flags); return ctx; } /* * Get the context for a task and increment its pin_count so it * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ static struct perf_event_context * perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } return ctx; } static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } /* * Update the record of the current time in a context. */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); lockdep_assert_held(&ctx->lock); if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; /* * The above: time' = time + (now - timestamp), can be re-arranged * into: time` = now + (time - timestamp), which gives a single value * offset to compute future time without locks on. * * See perf_event_time_now(), which can be used from NMI context where * it's (obviously) not possible to acquire ctx->lock in order to read * both the above values in a consistent manner. */ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time(event); return ctx->time; } static u64 perf_event_time_now(struct perf_event *event, u64 now) { struct perf_event_context *ctx = event->ctx; if (unlikely(!ctx)) return 0; if (is_cgroup_event(event)) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) return ctx->time; now += READ_ONCE(ctx->timeoffset); return now; } static enum event_type_t get_event_type(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; enum event_type_t event_type; lockdep_assert_held(&ctx->lock); /* * It's 'group type', really, because if our group leader is * pinned, so are we. */ if (event->group_leader != event) event = event->group_leader; event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE; if (!ctx->task) event_type |= EVENT_CPU; return event_type; } /* * Helper function to initialize event group nodes. */ static void init_event_group(struct perf_event *event) { RB_CLEAR_NODE(&event->group_node); event->group_index = 0; } /* * Extract pinned or flexible groups from the context * based on event attrs bits. */ static struct perf_event_groups * get_event_groups(struct perf_event *event, struct perf_event_context *ctx) { if (event->attr.pinned) return &ctx->pinned_groups; else return &ctx->flexible_groups; } /* * Helper function to initializes perf_event_group trees. */ static void perf_event_groups_init(struct perf_event_groups *groups) { groups->tree = RB_ROOT; groups->index = 0; } static inline struct cgroup *event_cgroup(const struct perf_event *event) { struct cgroup *cgroup = NULL; #ifdef CONFIG_CGROUP_PERF if (event->cgrp) cgroup = event->cgrp->css.cgroup; #endif return cgroup; } /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ static __always_inline int perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, const struct cgroup *left_cgroup, const u64 left_group_index, const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; if (left_pmu) { if (left_pmu < right->pmu_ctx->pmu) return -1; if (left_pmu > right->pmu_ctx->pmu) return 1; } #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); if (left_cgroup != right_cgroup) { if (!left_cgroup) { /* * Left has no cgroup but right does, no * cgroups come first. */ return -1; } if (!right_cgroup) { /* * Right has no cgroup but left does, no * cgroups come first. */ return 1; } /* Two dissimilar cgroups, order by id. */ if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) return -1; return 1; } } #endif if (left_group_index < right->group_index) return -1; if (left_group_index > right->group_index) return 1; return 0; } #define __node_2_pe(node) \ rb_entry((node), struct perf_event, group_node) static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; struct pmu *pmu; struct cgroup *cgroup; }; static inline int __group_cmp(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); } static inline int __group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) { const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), b->group_index, b); } /* * Insert @event into @groups' tree; using * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { event->group_index = ++groups->index; rb_add(&event->group_node, &groups->tree, __group_less); } /* * Helper function to insert event into the pinned or flexible groups. */ static void add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_insert(groups, event); } /* * Delete a group from a tree. */ static void perf_event_groups_delete(struct perf_event_groups *groups, struct perf_event *event) { WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) || RB_EMPTY_ROOT(&groups->tree)); rb_erase(&event->group_node, &groups->tree); init_event_group(event); } /* * Helper function to delete event from its groups. */ static void del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_groups *groups; groups = get_event_groups(event, ctx); perf_event_groups_delete(groups, event); } /* * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; node = rb_find_first(&key, &groups->tree, __group_cmp); if (node) return __node_2_pe(node); return NULL; } static struct perf_event * perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; next = rb_next_match(&key, &event->group_node, __group_cmp); if (next) return __node_2_pe(next); return NULL; } #define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ event; event = perf_event_groups_next(event, pmu)) /* * Iterate through the whole groups tree. */ #define perf_event_groups_for_each(event, groups) \ for (event = rb_entry_safe(rb_first(&((groups)->tree)), \ typeof(*event), group_node); event; \ event = rb_entry_safe(rb_next(&event->group_node), \ typeof(*event), group_node)) /* * Does the event attribute request inherit with PERF_SAMPLE_READ */ static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr) { return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ); } /* * Add an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; event->tstamp = perf_event_time(event); /* * If we're a stand alone event or group leader, we go to the context * list, group events are kept attached to the group so that * perf_group_detach can, at all times, locate all siblings. */ if (event->group_leader == event) { event->group_caps = event->event_caps; add_event_to_groups(event, ctx); } list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; if (has_inherit_and_sample_read(&event->attr)) local_inc(&ctx->nr_no_switch_fast); if (event->state > PERF_EVENT_STATE_OFF) perf_cgroup_event_enable(event, ctx); ctx->generation++; event->pmu_ctx->nr_events++; } /* * Initialize event state based on the perf_event_attr::disabled. */ static inline void perf_event__state_init(struct perf_event *event) { event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF : PERF_EVENT_STATE_INACTIVE; } static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } /* * Since perf_event_validate_size() limits this to 16k and inhibits * adding more siblings, this will never overflow. */ return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) { struct perf_sample_data *data; u16 size = 0; if (sample_type & PERF_SAMPLE_IP) size += sizeof(data->ip); if (sample_type & PERF_SAMPLE_ADDR) size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; if (sample_type & PERF_SAMPLE_DATA_SRC) size += sizeof(data->data_src.val); if (sample_type & PERF_SAMPLE_TRANSACTION) size += sizeof(data->txn); if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) size += sizeof(data->code_page_size); event->header_size = size; } /* * Called at perf_event creation and when events are attached/detached from a * group. */ static void perf_event__header_size(struct perf_event *event) { event->read_size = __perf_event_read_size(event->attr.read_format, event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } static void perf_event__id_header_size(struct perf_event *event) { struct perf_sample_data *data; u64 sample_type = event->attr.sample_type; u16 size = 0; if (sample_type & PERF_SAMPLE_TID) size += sizeof(data->tid_entry); if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); if (sample_type & PERF_SAMPLE_IDENTIFIER) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); if (sample_type & PERF_SAMPLE_STREAM_ID) size += sizeof(data->stream_id); if (sample_type & PERF_SAMPLE_CPU) size += sizeof(data->cpu_entry); event->id_header_size = size; } /* * Check that adding an event to the group does not result in anybody * overflowing the 64k event limit imposed by the output buffer. * * Specifically, check that the read_size for the event does not exceed 16k, * read_size being the one term that grows with groups size. Since read_size * depends on per-event read_format, also (re)check the existing events. * * This leaves 48k for the constant size fields and things like callchains, * branch stacks and register sets. */ static bool perf_event_validate_size(struct perf_event *event) { struct perf_event *sibling, *group_leader = event->group_leader; if (__perf_event_read_size(event->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; if (__perf_event_read_size(group_leader->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; /* * When creating a new group leader, group_leader->ctx is initialized * after the size has been validated, but we cannot safely use * for_each_sibling_event() until group_leader->ctx is set. A new group * leader cannot have any siblings yet, so we can safely skip checking * the non-existent siblings. */ if (event == group_leader) return true; for_each_sibling_event(sibling, group_leader) { if (__perf_event_read_size(sibling->attr.read_format, group_leader->nr_siblings + 1) > 16*1024) return false; } return true; } static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader, *pos; lockdep_assert_held(&event->ctx->lock); /* * We can have double attach due to group movement (move_group) in * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) return; WARN_ON_ONCE(group_leader->ctx != event->ctx); group_leader->group_caps &= event->event_caps; list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; group_leader->group_generation++; perf_event__header_size(group_leader); for_each_sibling_event(pos, group_leader) perf_event__header_size(pos); } /* * Remove an event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; if (has_inherit_and_sample_read(&event->attr)) local_dec(&ctx->nr_no_switch_fast); list_del_rcu(&event->event_entry); if (event->group_leader == event) del_event_from_groups(event, ctx); /* * If event was in error state, then keep it * that way, otherwise bogus counts will be * returned on read(). The only way to get out * of error state is by explicit re-enabling * of the event */ if (event->state > PERF_EVENT_STATE_OFF) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); } ctx->generation++; event->pmu_ctx->nr_events--; } static int perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) { if (!has_aux(aux_event)) return 0; if (!event->pmu->aux_output_match) return 0; return event->pmu->aux_output_match(aux_event); } static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *iter; /* * If event uses aux_event tear down the link */ if (event->aux_event) { iter = event->aux_event; event->aux_event = NULL; put_event(iter); return; } /* * If the event is an aux_event, tear down all links to * it from other events. */ for_each_sibling_event(iter, event->group_leader) { if (iter->aux_event != event) continue; iter->aux_event = NULL; put_event(event); /* * If it's ACTIVE, schedule it out and put it into ERROR * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ event_sched_out(iter, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } static bool perf_need_aux_event(struct perf_event *event) { return event->attr.aux_output || has_aux_action(event); } static int perf_get_aux_event(struct perf_event *event, struct perf_event *group_leader) { /* * Our group leader must be an aux event if we want to be * an aux_output. This way, the aux event will precede its * aux_output events in the group, and therefore will always * schedule first. */ if (!group_leader) return 0; /* * aux_output and aux_sample_size are mutually exclusive. */ if (event->attr.aux_output && event->attr.aux_sample_size) return 0; if (event->attr.aux_output && !perf_aux_output_match(event, group_leader)) return 0; if ((event->attr.aux_pause || event->attr.aux_resume) && !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) return 0; if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; if (!atomic_long_inc_not_zero(&group_leader->refcount)) return 0; /* * Link aux_outputs to their aux event; this is undone in * perf_group_detach() by perf_put_aux_event(). When the * group in torn down, the aux_output events loose their * link to the aux_event and can't schedule any more. */ event->aux_event = group_leader; return 1; } static inline struct list_head *get_event_list(struct perf_event *event) { return event->attr.pinned ? &event->pmu_ctx->pinned_active : &event->pmu_ctx->flexible_active; } /* * Events that have PERF_EV_CAP_SIBLING require being part of a group and * cannot exist on their own, schedule them out and move them into the ERROR * state. Also see _perf_event_enable(), it will not be able to recover * this ERROR state. */ static inline void perf_remove_sibling_event(struct perf_event *event) { event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct perf_event *sibling, *tmp; struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->lock); /* * We can have double detach due to exit/hot-unplug + close. */ if (!(event->attach_state & PERF_ATTACH_GROUP)) return; event->attach_state &= ~PERF_ATTACH_GROUP; perf_put_aux_event(event); /* * If this is a sibling, remove it from its group. */ if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; event->group_leader->group_generation++; goto out; } /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { if (sibling->event_caps & PERF_EV_CAP_SIBLING) perf_remove_sibling_event(sibling); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); /* Inherit group flags from the previous leader */ sibling->group_caps = event->group_caps; if (sibling->attach_state & PERF_ATTACH_CONTEXT) { add_event_to_groups(sibling, event->ctx); if (sibling->state == PERF_EVENT_STATE_ACTIVE) list_add_tail(&sibling->active_list, get_event_list(sibling)); } WARN_ON_ONCE(sibling->ctx != event->ctx); } out: for_each_sibling_event(tmp, leader) perf_event__header_size(tmp); perf_event__header_size(leader); } static void sync_child_event(struct perf_event *child_event); static void perf_child_detach(struct perf_event *event) { struct perf_event *parent_event = event->parent; if (!(event->attach_state & PERF_ATTACH_CHILD)) return; event->attach_state &= ~PERF_ATTACH_CHILD; if (WARN_ON_ONCE(!parent_event)) return; lockdep_assert_held(&parent_event->child_mutex); sync_child_event(event); list_del_init(&event->child_list); } static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; } static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && perf_cgroup_match(event); } static void event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; // XXX cpc serialization, probably per-cpu IRQ disabled WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state != PERF_EVENT_STATE_ACTIVE) return; /* * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but * we can schedule events _OUT_ individually through things like * __perf_remove_from_context(). */ list_del_init(&event->active_list); perf_pmu_disable(event->pmu); event->pmu->del(event, 0); event->oncpu = -1; if (event->pending_disable) { event->pending_disable = 0; perf_cgroup_event_disable(event, ctx); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); if (!is_software_event(event)) cpc->active_oncpu--; if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq--; epc->nr_freq--; } if (event->attr.exclusive || !cpc->active_oncpu) cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) event_sched_out(event, ctx); } static inline void __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx, final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { __ctx_time_update(cpuctx, ctx, false); } /* * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock(). */ static inline void ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { ctx_time_update(cpuctx, ctx); if (ctx->is_active & EVENT_TIME) ctx->is_active |= EVENT_FROZEN; } static inline void ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; update_context_time(ctx); update_cgrp_time_from_event(event); } } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL #define DETACH_DEAD 0x04UL #define DETACH_EXIT 0x08UL /* * Cross CPU call to remove a performance event * * We disable the event on the hardware level first. After that we * remove it from the context list. */ static void __perf_remove_from_context(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; enum perf_event_state state = PERF_EVENT_STATE_OFF; unsigned long flags = (unsigned long)info; ctx_time_update(cpuctx, ctx); /* * Ensure event_sched_out() switches to OFF, at the very least * this avoids raising perf_pending_task() at this time. */ if (flags & DETACH_EXIT) state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_DEAD) { event->pending_disable = 1; state = PERF_EVENT_STATE_DEAD; } event_sched_out(event, ctx); perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); if (!pmu_ctx->nr_events) { pmu_ctx->rotate_necessary = 0; if (ctx->task && ctx->is_active) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; } } } /* * Remove the event from a task's (or a CPU's) list of events. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { struct perf_event_context *ctx = event->ctx; lockdep_assert_held(&ctx->mutex); /* * Because of perf_event_exit_task(), perf_remove_from_context() ought * to work in the face of TASK_TOMBSTONE, unlike every other * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); if (!ctx->is_active) { __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ static void __perf_event_disable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { if (event->state < PERF_EVENT_STATE_INACTIVE) return; perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); if (event == event->group_leader) group_sched_out(event, ctx); else event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); perf_pmu_enable(event->pmu_ctx->pmu); } /* * Disable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); return; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_disable, NULL); } void perf_event_disable_local(struct perf_event *event) { event_function_local(event, __perf_event_disable, NULL); } /* * Strictly speaking kernel users cannot create groups and therefore this * interface does not need the perf_event_ctx_lock() magic. */ void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_disable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); if (event->state <= PERF_EVENT_STATE_OFF) return 0; WRITE_ONCE(event->oncpu, smp_processor_id()); /* * Order event::oncpu write to happen before the ACTIVE state is * visible. This allows perf_event_{stop,read}() to observe the correct * ->oncpu if it sees ACTIVE. */ smp_wmb(); perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE); /* * Unthrottle events, since we scheduled we might have missed several * ticks already, also for a heavily scheduling task there is little * guarantee it'll get a tick in a timely manner. */ if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { perf_log_throttle(event, 1); event->hw.interrupts = 0; } perf_pmu_disable(event->pmu); perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); event->oncpu = -1; ret = -EAGAIN; goto out; } if (!is_software_event(event)) cpc->active_oncpu++; if (event->attr.freq && event->attr.sample_freq) { ctx->nr_freq++; epc->nr_freq++; } if (event->attr.exclusive) cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); return ret; } static int group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } } if (!pmu->commit_txn(pmu)) return 0; group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: * The events up to the failed event are scheduled out normally. */ for_each_sibling_event(event, group_event) { if (event == partial_group) break; event_sched_out(event, ctx); } event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); return -EAGAIN; } /* * Work out whether we can put this event group on the CPU now. */ static int group_can_go_on(struct perf_event *event, int can_add_hw) { struct perf_event_pmu_context *epc = event->pmu_ctx; struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu); /* * Groups consisting entirely of software events can always go on. */ if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware * events can go on. */ if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already * events on the CPU, it can't go on. */ if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able * to go on. */ return can_add_hw; } static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); perf_group_attach(event); } static void task_ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; ctx_sched_out(ctx, pmu, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct pmu *pmu) { ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); if (ctx) ctx_sched_in(ctx, pmu, EVENT_PINNED); ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); if (ctx) ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); } /* * We want to maintain the following priority of scheduling: * - CPU pinned (EVENT_CPU | EVENT_PINNED) * - task pinned (EVENT_PINNED) * - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE) * - task flexible (EVENT_FLEXIBLE). * * In order to avoid unscheduling and scheduling back in everything every * time an event is added, only do it for the groups of equal priority and * below. * * This can be called after a batch operation on task events, in which case * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, struct pmu *pmu, enum event_type_t event_type) { bool cpu_event = !!(event_type & EVENT_CPU); struct perf_event_pmu_context *epc; /* * If pinned groups are involved, flexible groups also need to be * scheduled out. */ if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; event_type &= EVENT_ALL; for_each_epc(epc, &cpuctx->ctx, pmu, false) perf_pmu_disable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, false) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); } /* * Decide which cpu ctx groups to schedule out based on the types * of events that caused rescheduling: * - EVENT_CPU: schedule out corresponding groups; * - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups; * - otherwise, do nothing more. */ if (cpu_event) ctx_sched_out(&cpuctx->ctx, pmu, event_type); else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx, pmu); for_each_epc(epc, &cpuctx->ctx, pmu, false) perf_pmu_enable(epc->pmu); if (task_ctx) { for_each_epc(epc, task_ctx, pmu, false) perf_pmu_enable(epc->pmu); } } void perf_pmu_resched(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU); perf_ctx_unlock(cpuctx, task_ctx); } /* * Cross CPU call to install and enable a performance event * * Very similar to remote_function() + event_function() but cannot assume that * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; raw_spin_lock(&cpuctx->ctx.lock); if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; reprogram = (ctx->task == current); /* * If the task is running, it must be running on this CPU, * otherwise we cannot reprogram things. * * If its not running, we don't care, ctx->lock will * serialize against it becoming runnable. */ if (task_curr(ctx->task) && !reprogram) { ret = -ESRCH; goto unlock; } WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx); } else if (task_ctx) { raw_spin_lock(&task_ctx->lock); } #ifdef CONFIG_CGROUP_PERF if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) { /* * If the current cgroup doesn't match the event's * cgroup, we should not try to schedule it. */ struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); reprogram = cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup); } #endif if (reprogram) { ctx_time_freeze(cpuctx, ctx); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } else { add_event_to_ctx(event, ctx); } unlock: perf_ctx_unlock(cpuctx, task_ctx); return ret; } static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx); /* * Attach a performance event to a context. * * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx * will be 'complete'. See perf_iterate_sb_cpu(). */ smp_store_release(&event->ctx, ctx); /* * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); return; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); return; } if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; } /* * Should not happen, we validate the ctx is still alive before calling. */ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; /* * Installing events is tricky because we cannot rely on ctx->is_active * to be set in case this is the nr_events 0 -> 1 transition. * * Instead we use task_curr(), which tells us if the task is running. * However, since we use task_curr() outside of rq::lock, we can race * against the actual state. This means the result can be wrong. * * If we get a false positive, we retry, this is harmless. * * If we get a false negative, things are complicated. If we are after * perf_event_context_sched_in() ctx::lock will serialize us, and the * value must be correct. If we're before, it doesn't matter since * perf_event_context_sched_in() will program the counter. * * However, this hinges on the remote context switch having observed * our task->perf_event_ctxp[] store, such that it will in fact take * ctx::lock in perf_event_context_sched_in(). * * We do this by task_function_call(), if the IPI fails to hit the task * we know any future context switch of task must see the * perf_event_ctpx[] store. */ /* * This smp_mb() orders the task->perf_event_ctxp[] store with the * task_cpu() load, such that if the IPI then does not find the task * running, a future context switch of that task must observe the * store. */ smp_mb(); again: if (!task_function_call(task, __perf_install_in_context, event)) return; raw_spin_lock_irq(&ctx->lock); task = ctx->task; if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* * Cannot happen because we already checked above (which also * cannot happen), and we hold ctx->mutex, which serializes us * against perf_event_exit_task_context(). */ raw_spin_unlock_irq(&ctx->lock); return; } /* * If the task is not running, ctx->lock will avoid it becoming so, * thus we can safely install the event. */ if (task_curr(task)) { raw_spin_unlock_irq(&ctx->lock); goto again; } add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to enable a performance event */ static void __perf_event_enable(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, void *info) { struct perf_event *leader = event->group_leader; struct perf_event_context *task_ctx; if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state <= PERF_EVENT_STATE_ERROR) return; ctx_time_freeze(cpuctx, ctx); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); if (!ctx->is_active) return; if (!event_filter_match(event)) return; /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) return; task_ctx = cpuctx->task_ctx; if (ctx->task) WARN_ON_ONCE(task_ctx != ctx); ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event)); } /* * Enable an event. * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); if (event->state >= PERF_EVENT_STATE_INACTIVE || event->state < PERF_EVENT_STATE_ERROR) { out: raw_spin_unlock_irq(&ctx->lock); return; } /* * If the event is in error state, clear that first. * * That way, if we see the event in error state below, we know that it * has gone back into error state, as distinct from the task having * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) { /* * Detached SIBLING events cannot leave ERROR state. */ if (event->event_caps & PERF_EV_CAP_SIBLING && event->group_leader == event) goto out; event->state = PERF_EVENT_STATE_OFF; } raw_spin_unlock_irq(&ctx->lock); event_function_call(event, __perf_event_enable, NULL); } /* * See perf_event_disable(); */ void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx; ctx = perf_event_ctx_lock(event); _perf_event_enable(event); perf_event_ctx_unlock(event, ctx); } EXPORT_SYMBOL_GPL(perf_event_enable); struct stop_event_data { struct perf_event *event; unsigned int restart; }; static int __perf_event_stop(void *info) { struct stop_event_data *sd = info; struct perf_event *event = sd->event; /* if it's already INACTIVE, do nothing */ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * There is a window with interrupts enabled before we get here, * so we need to check again lest we try to stop another CPU's event. */ if (READ_ONCE(event->oncpu) != smp_processor_id()) return -EAGAIN; event->pmu->stop(event, PERF_EF_UPDATE); /* * May race with the actual stop (through perf_pmu_output_stop()), * but it is only used for events with AUX ring buffer, and such * events will refuse to restart because of rb::aux_mmap_count==0, * see comments in perf_aux_output_begin(). * * Since this is happening on an event-local CPU, no trace is lost * while restarting. */ if (sd->restart) event->pmu->start(event, 0); return 0; } static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, .restart = restart, }; int ret = 0; do { if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE) return 0; /* matches smp_wmb() in event_sched_in() */ smp_rmb(); /* * We only want to restart ACTIVE events, so if the event goes * inactive here (event->oncpu==-1), there's nothing more to do; * fall through with ret==-ENXIO. */ ret = cpu_function_call(READ_ONCE(event->oncpu), __perf_event_stop, &sd); } while (ret == -EAGAIN); return ret; } /* * In order to contain the amount of racy and tricky in the address filter * configuration management, it is a two part process: * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. * * If (p1) happens while the event is active, we restart it to force (p2). * * (1) perf_addr_filters_apply(): adjusting filters' offsets based on * pre-existing mappings, called once when new filters arrive via SET_FILTER * ioctl; * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly * registered mapping, called for every new mmap(), with mm::mmap_lock down * for reading; * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process * of exec. */ void perf_event_addr_filters_sync(struct perf_event *event) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); if (!has_addr_filter(event)) return; raw_spin_lock(&ifh->lock); if (event->addr_filters_gen != event->hw.addr_filters_gen) { event->pmu->addr_filters_sync(event); event->hw.addr_filters_gen = event->addr_filters_gen; } raw_spin_unlock(&ifh->lock); } EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync); static int _perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events */ if (event->attr.inherit || !is_sampling_event(event)) return -EINVAL; atomic_add(refresh, &event->event_limit); _perf_event_enable(event); return 0; } /* * See perf_event_disable() */ int perf_event_refresh(struct perf_event *event, int refresh) { struct perf_event_context *ctx; int ret; ctx = perf_event_ctx_lock(event); ret = _perf_event_refresh(event, refresh); perf_event_ctx_unlock(event, ctx); return ret; } EXPORT_SYMBOL_GPL(perf_event_refresh); static int perf_event_modify_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { int err; _perf_event_disable(bp); err = modify_user_hw_breakpoint_check(bp, attr, true); if (!bp->attr.disabled) _perf_event_enable(bp); return err; } /* * Copy event-type-independent attributes that may be modified. */ static void perf_event_modify_copy_attr(struct perf_event_attr *to, const struct perf_event_attr *from) { to->sig_data = from->sig_data; } static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { int (*func)(struct perf_event *, struct perf_event_attr *); struct perf_event *child; int err; if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: func = perf_event_modify_breakpoint; break; default: /* Place holder for future additions. */ return -EOPNOTSUPP; } WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); /* * Event-type-independent attributes must be copied before event-type * modification, which will validate that final attributes match the * source attributes after all relevant attributes have been copied. */ perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; } out: mutex_unlock(&event->child_mutex); return err; } static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; struct perf_event *event, *tmp; struct pmu *pmu = pmu_ctx->pmu; if (ctx->task && !(ctx->is_active & EVENT_ALL)) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = NULL; } if (!(event_type & EVENT_ALL)) return; perf_pmu_disable(pmu); if (event_type & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &pmu_ctx->pinned_active, active_list) group_sched_out(event, ctx); } if (event_type & EVENT_FLEXIBLE) { list_for_each_entry_safe(event, tmp, &pmu_ctx->flexible_active, active_list) group_sched_out(event, ctx); /* * Since we cleared EVENT_FLEXIBLE, also clear * rotate_necessary, is will be reset by * ctx_flexible_sched_in() when needed. */ pmu_ctx->rotate_necessary = 0; } perf_pmu_enable(pmu); } /* * Be very careful with the @pmu argument since this will change ctx state. * The @pmu argument works for ctx_resched(), because that is symmetric in * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant. * * However, if you were to be asymmetrical, you could end up with messed up * state, eg. ctx->is_active cleared even though most EPCs would still actually * be active. */ static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) { /* * See __perf_remove_from_context(). */ WARN_ON_ONCE(ctx->is_active); if (ctx->task) WARN_ON_ONCE(cpuctx->task_ctx); return; } /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last * context we sched out. For example: * * ctx_sched_out(.event_type = EVENT_FLEXIBLE) * ctx_sched_out(.event_type = EVENT_PINNED) * * would only update time for the pinned events. */ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); ctx->is_active &= ~event_type; if (!(ctx->is_active & EVENT_ALL)) { /* * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now() * does not observe a hole. perf_ctx_unlock() will clean up. */ if (ctx->is_active & EVENT_FROZEN) ctx->is_active &= EVENT_TIME_FROZEN; else ctx->is_active = 0; } if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); if (!(ctx->is_active & EVENT_ALL)) cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_out(pmu_ctx, is_active); } /* * Test whether two contexts are equivalent, i.e. whether they have both been * cloned from the same version of the same context. * * Equivalence is measured using a generation number in the context that is * incremented on each modification to it; see unclone_ctx(), list_add_event() * and list_del_event(). */ static int context_equiv(struct perf_event_context *ctx1, struct perf_event_context *ctx2) { lockdep_assert_held(&ctx1->lock); lockdep_assert_held(&ctx2->lock); /* Pinning disables the swap optimization */ if (ctx1->pin_count || ctx2->pin_count) return 0; /* If ctx1 is the parent of ctx2 */ if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen) return 1; /* If ctx2 is the parent of ctx1 */ if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation) return 1; /* * If ctx1 and ctx2 have the same parent; we flatten the parent * hierarchy, see perf_event_init_context(). */ if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen) return 1; /* Unmatched */ return 0; } static void __perf_event_sync_stat(struct perf_event *event, struct perf_event *next_event) { u64 value; if (!event->attr.inherit_stat) return; /* * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however * we know the event must be on the current CPU, therefore we * don't need to use it. */ perf_pmu_read(event); perf_event_update_time(event); /* * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ value = local64_read(&next_event->count); value = local64_xchg(&event->count, value); local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. */ perf_event_update_userpage(event); perf_event_update_userpage(next_event); } static void perf_event_sync_stat(struct perf_event_context *ctx, struct perf_event_context *next_ctx) { struct perf_event *event, *next_event; if (!ctx->nr_stat) return; update_context_time(ctx); event = list_first_entry(&ctx->event_list, struct perf_event, event_entry); next_event = list_first_entry(&next_ctx->event_list, struct perf_event, event_entry); while (&event->event_entry != &ctx->event_list && &next_event->event_entry != &next_ctx->event_list) { __perf_event_sync_stat(event, next_event); event = list_next_entry(event, event_entry); next_event = list_next_entry(next_event, event_entry); } } static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, struct task_struct *task, bool sched_in) { struct perf_event_pmu_context *pmu_ctx; struct perf_cpu_pmu_context *cpc; list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { cpc = this_cpc(pmu_ctx->pmu); if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in); } } static void perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; int do_switch = 1; if (likely(!ctx)) return; rcu_read_lock(); next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; parent = rcu_dereference(ctx->parent_ctx); next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. */ if (!parent && !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { /* * Looks like the two contexts are clones, so we might be * able to optimize the context switch. We lock both * contexts and check that they are clones under the * lock (including re-checking that neither has been * uncloned in the meantime). It doesn't matter which * order we take the locks because no other cpu could * be trying to lock both of these tasks. */ raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { perf_ctx_disable(ctx, false); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */ if (local_read(&ctx->nr_no_switch_fast) || local_read(&next_ctx->nr_no_switch_fast)) { /* * Must not swap out ctx when there's pending * events that rely on the ctx->task relation. * * Likewise, when a context contains inherit + * SAMPLE_READ events they should be switched * out using the slow path so that they are * treated as if they were distinct contexts. */ raw_spin_unlock(&next_ctx->lock); rcu_read_unlock(); goto inside_switch; } WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); perf_ctx_sched_task_cb(ctx, task, false); perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not * modified the ctx and the above modification of * ctx->task is immaterial since this value is * always verified under ctx->lock which we're now * holding. */ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; perf_event_sync_stat(ctx, next_ctx); } raw_spin_unlock(&next_ctx->lock); raw_spin_unlock(&ctx->lock); } unlock: rcu_read_unlock(); if (do_switch) { raw_spin_lock(&ctx->lock); perf_ctx_disable(ctx, false); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu); this_cpu_dec(perf_sched_cb_usages); barrier(); if (!--cpc->sched_cb_usage) list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { struct perf_cpu_pmu_context *cpc = this_cpc(pmu); if (!cpc->sched_cb_usage++) list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); barrier(); this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. * * This callback is relevant even to per-cpu events; for example multi event * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, struct task_struct *task, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; pmu = cpc->epc.pmu; /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); pmu->sched_task(cpc->task_epc, task, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cpu_pmu_context *cpc; /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ if (prev == next || cpuctx->task_ctx) return; list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. * * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * not restart the event. */ void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args) { const struct perf_event *le = *(const struct perf_event **)l; const struct perf_event *re = *(const struct perf_event **)r; return le->group_index < re->group_index; } DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap); static const struct min_heap_callbacks perf_min_heap = { .less = perf_less_group_idx, .swp = NULL, }; static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event) { struct perf_event **itrs = heap->data; if (event) { itrs[heap->nr] = event; heap->nr++; } } static void __link_epc(struct perf_event_pmu_context *pmu_ctx) { struct perf_cpu_pmu_context *cpc; if (!pmu_ctx->ctx->task) return; cpc = this_cpc(pmu_ctx->pmu); WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); cpc->task_epc = pmu_ctx; } static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. */ struct perf_event *itrs[2]; struct perf_event_min_heap event_heap; struct perf_event **evt; int ret; if (pmu->filter && pmu->filter(pmu, cpu)) return 0; if (!ctx->task) { cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct perf_event_min_heap){ .data = cpuctx->heap, .nr = 0, .size = cpuctx->heap_size, }; lockdep_assert_held(&cpuctx->ctx.lock); #ifdef CONFIG_CGROUP_PERF if (cpuctx->cgrp) css = &cpuctx->cgrp->css; #endif } else { event_heap = (struct perf_event_min_heap){ .data = itrs, .nr = 0, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif if (event_heap.nr) { __link_epc((*evt)->pmu_ctx); perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); } min_heapify_all_inline(&event_heap, &perf_min_heap, NULL); while (event_heap.nr) { ret = func(*evt, data); if (ret) return ret; *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL); else min_heap_pop_inline(&event_heap, &perf_min_heap, NULL); } return 0; } /* * Because the userpage is strictly per-event (there is no concept of context, * so there cannot be a context indirection), every userpage must be updated * when context time starts :-( * * IOW, we must not miss EVENT_TIME edges. */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); perf_event_update_userpage(event); return true; } static inline void group_update_userpage(struct perf_event *group_event) { struct perf_event *event; if (!event_update_userpage(group_event)) return; for_each_sibling_event(event, group_event) event_update_userpage(event); } static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; if (!event_filter_match(event)) return 0; if (group_can_go_on(event, *can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); if (*perf_event_fasync(event)) event->pending_kill = POLL_HUP; perf_event_wakeup(event); } else { struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); event->pmu_ctx->rotate_necessary = 1; perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } return 0; } static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, struct pmu *pmu) { int can_add_hw = 1; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, merge_sched_in, &can_add_hw); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, enum event_type_t event_type) { struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); if (event_type & EVENT_FLEXIBLE) pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); } static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; bool cgroup = event_type & EVENT_CGROUP; event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; if (!(is_active & EVENT_TIME)) { /* start ctx time */ __update_context_time(ctx, false); perf_cgroup_set_timestamp(cpuctx); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); } ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; else WARN_ON_ONCE(cpuctx->task_ctx != ctx); } is_active ^= ctx->is_active; /* changed bits */ /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) { for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { for_each_epc(pmu_ctx, ctx, pmu, cgroup) __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); } } static void perf_event_context_sched_in(struct task_struct *task) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx; rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto rcu_unlock; if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); perf_ctx_disable(ctx, false); perf_ctx_sched_task_cb(ctx, task, true); perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); /* * We must check ctx->nr_events while holding ctx->lock, such * that we serialize against perf_install_in_context(). */ if (!ctx->nr_events) goto unlock; perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, * cpu flexible, task flexible. * * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { perf_ctx_disable(&cpuctx->ctx, false); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } perf_event_sched_in(cpuctx, ctx, NULL); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) perf_ctx_enable(&cpuctx->ctx, false); perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); rcu_unlock: rcu_read_unlock(); } /* * Called from scheduler to add the events of the current task * with interrupts disabled. * * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of event _before_ * accessing the event control register. If a NMI hits, then it will * keep the event running. */ void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; u64 sec = NSEC_PER_SEC; u64 divisor, dividend; int count_fls, nsec_fls, frequency_fls, sec_fls; count_fls = fls64(count); nsec_fls = fls64(nsec); frequency_fls = fls64(frequency); sec_fls = 30; /* * We got @count in @nsec, with a target of sample_freq HZ * the target period becomes: * * @count * 10^9 * period = ------------------- * @nsec * sample_freq * */ /* * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ #define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ a##_fls--; \ } else { \ b >>= 1; \ b##_fls--; \ } \ } while (0) /* * Reduce accuracy until either term fits in a u64, then proceed with * the other, so that finally we can do a u64/u64 division. */ while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); REDUCE_FLS(sec, count); } if (count_fls + sec_fls > 64) { divisor = nsec * frequency; while (count_fls + sec_fls > 64) { REDUCE_FLS(count, sec); divisor >>= 1; } dividend = count * sec; } else { dividend = count * sec; while (nsec_fls + frequency_fls > 64) { REDUCE_FLS(nsec, frequency); dividend >>= 1; } divisor = nsec * frequency; } if (!divisor) return dividend; return div64_u64(dividend, divisor); } static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); if (delta >= 0) delta += 7; else delta -= 7; delta /= 8; /* low pass filter */ sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { if (disable) event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); if (disable) event->pmu->start(event, PERF_EF_RELOAD); } } static void perf_adjust_freq_unthr_events(struct list_head *event_list) { struct perf_event *event; struct hw_perf_event *hwc; u64 now, period = TICK_NSEC; s64 delta; list_for_each_entry(event, event_list, active_list) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; hwc = &event->hw; if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); if (!event->attr.freq || !event->attr.sample_freq) event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) continue; /* * stop the event and update event->count */ event->pmu->stop(event, PERF_EF_UPDATE); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; /* * restart the event * reload only if value has changed * we have stopped the event so tell that * to perf_adjust_period() to avoid stopping it * twice. */ if (delta > 0) perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } } /* * combine freq adjustment with unthrottling to avoid two passes over the * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event_pmu_context *pmu_ctx; /* * only need to iterate over all events iff: * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { if (!(pmu_ctx->nr_freq || unthrottle)) continue; if (!perf_pmu_ctx_is_active(pmu_ctx)) continue; if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) continue; perf_pmu_disable(pmu_ctx->pmu); perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active); perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active); perf_pmu_enable(pmu_ctx->pmu); } raw_spin_unlock(&ctx->lock); } /* * Move @event to the tail of the @ctx's elegible events. */ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) { /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ if (ctx->rotate_disable) return; perf_event_groups_delete(&ctx->flexible_groups, event); perf_event_groups_insert(&ctx->flexible_groups, event); } /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; struct rb_node *node; struct rb_root *tree; struct __group_key key = { .pmu = pmu_ctx->pmu, }; /* pick the first active flexible event */ event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); if (event) goto out; /* if no active flexible event, pick the first event */ tree = &pmu_ctx->ctx->flexible_groups.tree; if (!pmu_ctx->ctx->task) { key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); goto out; } key.cpu = -1; node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) { event = __node_2_pe(node); goto out; } key.cpu = smp_processor_id(); node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); if (node) event = __node_2_pe(node); out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ pmu_ctx->rotate_necessary = 0; return event; } static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; int cpu_rotate, task_rotate; struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ cpu_epc = &cpc->epc; pmu = cpu_epc->pmu; task_epc = cpc->task_epc; cpu_rotate = cpu_epc->rotate_necessary; task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate))