91 91 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | // SPDX-License-Identifier: GPL-2.0-or-later /* * CRC32C *@Article{castagnoli-crc, * author = { Guy Castagnoli and Stefan Braeuer and Martin Herrman}, * title = {{Optimization of Cyclic Redundancy-Check Codes with 24 * and 32 Parity Bits}}, * journal = IEEE Transactions on Communication, * year = {1993}, * volume = {41}, * number = {6}, * pages = {}, * month = {June}, *} * Used by the iSCSI driver, possibly others, and derived from * the iscsi-crc.c module of the linux-iscsi driver at * http://linux-iscsi.sourceforge.net. * * Following the example of lib/crc32, this function is intended to be * flexible and useful for all users. Modules that currently have their * own crc32c, but hopefully may be able to use this one are: * net/sctp (please add all your doco to here if you change to * use this one!) * <endoflist> * * Copyright (c) 2004 Cisco Systems, Inc. */ #include <crypto/hash.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/crc32c.h> static struct crypto_shash *tfm; u32 crc32c(u32 crc, const void *address, unsigned int length) { SHASH_DESC_ON_STACK(shash, tfm); u32 ret, *ctx = (u32 *)shash_desc_ctx(shash); int err; shash->tfm = tfm; *ctx = crc; err = crypto_shash_update(shash, address, length); BUG_ON(err); ret = *ctx; barrier_data(ctx); return ret; } EXPORT_SYMBOL(crc32c); static int __init libcrc32c_mod_init(void) { tfm = crypto_alloc_shash("crc32c", 0, 0); return PTR_ERR_OR_ZERO(tfm); } static void __exit libcrc32c_mod_fini(void) { crypto_free_shash(tfm); } module_init(libcrc32c_mod_init); module_exit(libcrc32c_mod_fini); MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>"); MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); |
5 5 6 6 6 6 6 5 5 5 5 5 5 5 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 | // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/errno.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/xarray.h> #include <net/addrconf.h> #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/uverbs_ioctl.h> #include "siw.h" #include "siw_verbs.h" #include "siw_mem.h" static int siw_qp_state_to_ib_qp_state[SIW_QP_STATE_COUNT] = { [SIW_QP_STATE_IDLE] = IB_QPS_INIT, [SIW_QP_STATE_RTR] = IB_QPS_RTR, [SIW_QP_STATE_RTS] = IB_QPS_RTS, [SIW_QP_STATE_CLOSING] = IB_QPS_SQD, [SIW_QP_STATE_TERMINATE] = IB_QPS_SQE, [SIW_QP_STATE_ERROR] = IB_QPS_ERR }; static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = { [IB_QPS_RESET] = SIW_QP_STATE_IDLE, [IB_QPS_INIT] = SIW_QP_STATE_IDLE, [IB_QPS_RTR] = SIW_QP_STATE_RTR, [IB_QPS_RTS] = SIW_QP_STATE_RTS, [IB_QPS_SQD] = SIW_QP_STATE_CLOSING, [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE, [IB_QPS_ERR] = SIW_QP_STATE_ERROR }; static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = { [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", [IB_QPS_ERR] = "ERR" }; void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry) { struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry); kfree(entry); } int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) { struct siw_ucontext *uctx = to_siw_ctx(ctx); size_t size = vma->vm_end - vma->vm_start; struct rdma_user_mmap_entry *rdma_entry; struct siw_user_mmap_entry *entry; int rv = -EINVAL; /* * Must be page aligned */ if (vma->vm_start & (PAGE_SIZE - 1)) { pr_warn("siw: mmap not page aligned\n"); return -EINVAL; } rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma); if (!rdma_entry) { siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n", vma->vm_pgoff, size); return -EINVAL; } entry = to_siw_mmap_entry(rdma_entry); rv = remap_vmalloc_range(vma, entry->address, 0); if (rv) pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff, size); rdma_user_mmap_entry_put(rdma_entry); return rv; } int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata) { struct siw_device *sdev = to_siw_dev(base_ctx->device); struct siw_ucontext *ctx = to_siw_ctx(base_ctx); struct siw_uresp_alloc_ctx uresp = {}; int rv; if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) { rv = -ENOMEM; goto err_out; } ctx->sdev = sdev; uresp.dev_id = sdev->vendor_part_id; if (udata->outlen < sizeof(uresp)) { rv = -EINVAL; goto err_out; } rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rv) goto err_out; siw_dbg(base_ctx->device, "success. now %d context(s)\n", atomic_read(&sdev->num_ctx)); return 0; err_out: atomic_dec(&sdev->num_ctx); siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv, atomic_read(&sdev->num_ctx)); return rv; } void siw_dealloc_ucontext(struct ib_ucontext *base_ctx) { struct siw_ucontext *uctx = to_siw_ctx(base_ctx); atomic_dec(&uctx->sdev->num_ctx); } int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, struct ib_udata *udata) { struct siw_device *sdev = to_siw_dev(base_dev); if (udata->inlen || udata->outlen) return -EINVAL; memset(attr, 0, sizeof(*attr)); /* Revisit atomic caps if RFC 7306 gets supported */ attr->atomic_cap = 0; attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; attr->kernel_cap_flags = IBK_ALLOW_USER_UNREG; attr->max_cq = sdev->attrs.max_cq; attr->max_cqe = sdev->attrs.max_cqe; attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; attr->max_mr = sdev->attrs.max_mr; attr->max_mw = sdev->attrs.max_mw; attr->max_mr_size = ~0ull; attr->max_pd = sdev->attrs.max_pd; attr->max_qp = sdev->attrs.max_qp; attr->max_qp_init_rd_atom = sdev->attrs.max_ird; attr->max_qp_rd_atom = sdev->attrs.max_ord; attr->max_qp_wr = sdev->attrs.max_qp_wr; attr->max_recv_sge = sdev->attrs.max_sge; attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird; attr->max_send_sge = sdev->attrs.max_sge; attr->max_sge_rd = sdev->attrs.max_sge_rd; attr->max_srq = sdev->attrs.max_srq; attr->max_srq_sge = sdev->attrs.max_srq_sge; attr->max_srq_wr = sdev->attrs.max_srq_wr; attr->page_size_cap = PAGE_SIZE; attr->vendor_id = SIW_VENDOR_ID; attr->vendor_part_id = sdev->vendor_part_id; addrconf_addr_eui48((u8 *)&attr->sys_image_guid, sdev->raw_gid); return 0; } int siw_query_port(struct ib_device *base_dev, u32 port, struct ib_port_attr *attr) { struct net_device *ndev; int rv; memset(attr, 0, sizeof(*attr)); rv = ib_get_eth_speed(base_dev, port, &attr->active_speed, &attr->active_width); if (rv) return rv; ndev = ib_device_get_netdev(base_dev, SIW_PORT); if (!ndev) return -ENODEV; attr->gid_tbl_len = 1; attr->max_msg_sz = -1; attr->max_mtu = ib_mtu_int_to_enum(ndev->max_mtu); attr->active_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); attr->phys_state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; attr->state = attr->phys_state == IB_PORT_PHYS_STATE_LINK_UP ? IB_PORT_ACTIVE : IB_PORT_DOWN; attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; /* * All zero * * attr->lid = 0; * attr->bad_pkey_cntr = 0; * attr->qkey_viol_cntr = 0; * attr->sm_lid = 0; * attr->lmc = 0; * attr->max_vl_num = 0; * attr->sm_sl = 0; * attr->subnet_timeout = 0; * attr->init_type_repy = 0; */ dev_put(ndev); return rv; } int siw_get_port_immutable(struct ib_device *base_dev, u32 port, struct ib_port_immutable *port_immutable) { struct ib_port_attr attr; int rv = siw_query_port(base_dev, port, &attr); if (rv) return rv; port_immutable->gid_tbl_len = attr.gid_tbl_len; port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; return 0; } int siw_query_gid(struct ib_device *base_dev, u32 port, int idx, union ib_gid *gid) { struct siw_device *sdev = to_siw_dev(base_dev); /* subnet_prefix == interface_id == 0; */ memset(gid, 0, sizeof(*gid)); memcpy(gid->raw, sdev->raw_gid, ETH_ALEN); return 0; } int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata) { struct siw_device *sdev = to_siw_dev(pd->device); if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) { atomic_dec(&sdev->num_pd); return -ENOMEM; } siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd)); return 0; } int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) { struct siw_device *sdev = to_siw_dev(pd->device); siw_dbg_pd(pd, "free PD\n"); atomic_dec(&sdev->num_pd); return 0; } void siw_qp_get_ref(struct ib_qp *base_qp) { siw_qp_get(to_siw_qp(base_qp)); } void siw_qp_put_ref(struct ib_qp *base_qp) { siw_qp_put(to_siw_qp(base_qp)); } static struct rdma_user_mmap_entry * siw_mmap_entry_insert(struct siw_ucontext *uctx, void *address, size_t length, u64 *offset) { struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL); int rv; *offset = SIW_INVAL_UOBJ_KEY; if (!entry) return NULL; entry->address = address; rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext, &entry->rdma_entry, length); if (rv) { kfree(entry); return NULL; } *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); return &entry->rdma_entry; } /* * siw_create_qp() * * Create QP of requested size on given device. * * @qp: Queue pait * @attrs: Initial QP attributes. * @udata: used to provide QP ID, SQ and RQ size back to user. */ int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, struct ib_udata *udata) { struct ib_pd *pd = ibqp->pd; struct siw_qp *qp = to_siw_qp(ibqp); struct ib_device *base_dev = pd->device; struct siw_device *sdev = to_siw_dev(base_dev); struct siw_ucontext *uctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); unsigned long flags; int num_sqe, num_rqe, rv = 0; size_t length; siw_dbg(base_dev, "create new QP\n"); if (attrs->create_flags) return -EOPNOTSUPP; if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { siw_dbg(base_dev, "too many QP's\n"); rv = -ENOMEM; goto err_atomic; } if (attrs->qp_type != IB_QPT_RC) { siw_dbg(base_dev, "only RC QP's supported\n"); rv = -EOPNOTSUPP; goto err_atomic; } if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || (attrs->cap.max_send_sge > SIW_MAX_SGE) || (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { siw_dbg(base_dev, "QP size error\n"); rv = -EINVAL; goto err_atomic; } if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { siw_dbg(base_dev, "max inline send: %d > %d\n", attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); rv = -EINVAL; goto err_atomic; } /* * NOTE: we don't allow for a QP unable to hold any SQ WQE */ if (attrs->cap.max_send_wr == 0) { siw_dbg(base_dev, "QP must have send queue\n"); rv = -EINVAL; goto err_atomic; } if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) { siw_dbg(base_dev, "send CQ or receive CQ invalid\n"); rv = -EINVAL; goto err_atomic; } init_rwsem(&qp->state_lock); spin_lock_init(&qp->sq_lock); spin_lock_init(&qp->rq_lock); spin_lock_init(&qp->orq_lock); rv = siw_qp_add(sdev, qp); if (rv) goto err_atomic; /* All queue indices are derived from modulo operations * on a free running 'get' (consumer) and 'put' (producer) * unsigned counter. Having queue sizes at power of two * avoids handling counter wrap around. */ num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); num_rqe = attrs->cap.max_recv_wr; if (num_rqe) num_rqe = roundup_pow_of_two(num_rqe); if (udata) qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); else qp->sendq = vcalloc(num_sqe, sizeof(struct siw_sqe)); if (qp->sendq == NULL) { rv = -ENOMEM; goto err_out_xa; } if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) qp->attrs.flags |= SIW_SIGNAL_ALL_WR; else { rv = -EINVAL; goto err_out_xa; } } qp->pd = pd; qp->scq = to_siw_cq(attrs->send_cq); qp->rcq = to_siw_cq(attrs->recv_cq); if (attrs->srq) { /* * SRQ support. * Verbs 6.3.7: ignore RQ size, if SRQ present * Verbs 6.3.5: do not check PD of SRQ against PD of QP */ qp->srq = to_siw_srq(attrs->srq); qp->attrs.rq_size = 0; siw_dbg(base_dev, "QP [%u]: SRQ attached\n", qp->base_qp.qp_num); } else if (num_rqe) { if (udata) qp->recvq = vmalloc_user(num_rqe * sizeof(struct siw_rqe)); else qp->recvq = vcalloc(num_rqe, sizeof(struct siw_rqe)); if (qp->recvq == NULL) { rv = -ENOMEM; goto err_out_xa; } qp->attrs.rq_size = num_rqe; } qp->attrs.sq_size = num_sqe; qp->attrs.sq_max_sges = attrs->cap.max_send_sge; qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; /* Make those two tunables fixed for now. */ qp->tx_ctx.gso_seg_limit = 1; qp->tx_ctx.zcopy_tx = zcopy_tx; qp->attrs.state = SIW_QP_STATE_IDLE; if (udata) { struct siw_uresp_create_qp uresp = {}; uresp.num_sqe = num_sqe; uresp.num_rqe = num_rqe; uresp.qp_id = qp_id(qp); if (qp->sendq) { length = num_sqe * sizeof(struct siw_sqe); qp->sq_entry = siw_mmap_entry_insert(uctx, qp->sendq, length, &uresp.sq_key); if (!qp->sq_entry) { rv = -ENOMEM; goto err_out_xa; } } if (qp->recvq) { length = num_rqe * sizeof(struct siw_rqe); qp->rq_entry = siw_mmap_entry_insert(uctx, qp->recvq, length, &uresp.rq_key); if (!qp->rq_entry) { uresp.sq_key = SIW_INVAL_UOBJ_KEY; rv = -ENOMEM; goto err_out_xa; } } if (udata->outlen < sizeof(uresp)) { rv = -EINVAL; goto err_out_xa; } rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rv) goto err_out_xa; } qp->tx_cpu = siw_get_tx_cpu(sdev); if (qp->tx_cpu < 0) { rv = -EINVAL; goto err_out_xa; } INIT_LIST_HEAD(&qp->devq); spin_lock_irqsave(&sdev->lock, flags); list_add_tail(&qp->devq, &sdev->qp_list); spin_unlock_irqrestore(&sdev->lock, flags); init_completion(&qp->qp_free); return 0; err_out_xa: xa_erase(&sdev->qp_xa, qp_id(qp)); if (uctx) { rdma_user_mmap_entry_remove(qp->sq_entry); rdma_user_mmap_entry_remove(qp->rq_entry); } vfree(qp->sendq); vfree(qp->recvq); err_atomic: atomic_dec(&sdev->num_qp); return rv; } /* * Minimum siw_query_qp() verb interface. * * @qp_attr_mask is not used but all available information is provided */ int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { struct siw_qp *qp; struct net_device *ndev; if (base_qp && qp_attr && qp_init_attr) qp = to_siw_qp(base_qp); else return -EINVAL; ndev = ib_device_get_netdev(base_qp->device, SIW_PORT); if (!ndev) return -ENODEV; qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state]; qp_attr->cap.max_inline_data = SIW_MAX_INLINE; qp_attr->cap.max_send_wr = qp->attrs.sq_size; qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; qp_attr->cap.max_recv_wr = qp->attrs.rq_size; qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; qp_attr->path_mtu = ib_mtu_int_to_enum(READ_ONCE(ndev->mtu)); qp_attr->max_rd_atomic = qp->attrs.irq_size; qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; qp_init_attr->qp_type = base_qp->qp_type; qp_init_attr->send_cq = base_qp->send_cq; qp_init_attr->recv_cq = base_qp->recv_cq; qp_init_attr->srq = base_qp->srq; qp_init_attr->cap = qp_attr->cap; dev_put(ndev); return 0; } int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct siw_qp_attrs new_attrs; enum siw_qp_attr_mask siw_attr_mask = 0; struct siw_qp *qp = to_siw_qp(base_qp); int rv = 0; if (!attr_mask) return 0; if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) return -EOPNOTSUPP; memset(&new_attrs, 0, sizeof(new_attrs)); if (attr_mask & IB_QP_ACCESS_FLAGS) { siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) new_attrs.flags |= SIW_RDMA_READ_ENABLED; if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; if (attr->qp_access_flags & IB_ACCESS_MW_BIND) new_attrs.flags |= SIW_RDMA_BIND_ENABLED; } if (attr_mask & IB_QP_STATE) { siw_dbg_qp(qp, "desired IB QP state: %s\n", ib_qp_state_to_string[attr->qp_state]); new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; if (new_attrs.state > SIW_QP_STATE_RTS) qp->tx_ctx.tx_suspend = 1; siw_attr_mask |= SIW_QP_ATTR_STATE; } if (!siw_attr_mask) goto out; down_write(&qp->state_lock); rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); up_write(&qp->state_lock); out: return rv; } int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) { struct siw_qp *qp = to_siw_qp(base_qp); struct siw_ucontext *uctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); struct siw_qp_attrs qp_attrs; siw_dbg_qp(qp, "state %d\n", qp->attrs.state); /* * Mark QP as in process of destruction to prevent from * any async callbacks to RDMA core */ qp->attrs.flags |= SIW_QP_IN_DESTROY; qp->rx_stream.rx_suspend = 1; if (uctx) { rdma_user_mmap_entry_remove(qp->sq_entry); rdma_user_mmap_entry_remove(qp->rq_entry); } down_write(&qp->state_lock); qp_attrs.state = SIW_QP_STATE_ERROR; siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); if (qp->cep) { siw_cep_put(qp->cep); qp->cep = NULL; } up_write(&qp->state_lock); kfree(qp->tx_ctx.mpa_crc_hd); kfree(qp->rx_stream.mpa_crc_hd); qp->scq = qp->rcq = NULL; siw_qp_put(qp); wait_for_completion(&qp->qp_free); return 0; } /* * siw_copy_inline_sgl() * * Prepare sgl of inlined data for sending. For userland callers * function checks if given buffer addresses and len's are within * process context bounds. * Data from all provided sge's are copied together into the wqe, * referenced by a single sge. */ static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, struct siw_sqe *sqe) { struct ib_sge *core_sge = core_wr->sg_list; void *kbuf = &sqe->sge[1]; int num_sge = core_wr->num_sge, bytes = 0; sqe->sge[0].laddr = (uintptr_t)kbuf; sqe->sge[0].lkey = 0; while (num_sge--) { if (!core_sge->length) { core_sge++; continue; } bytes += core_sge->length; if (bytes > SIW_MAX_INLINE) { bytes = -EINVAL; break; } memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr), core_sge->length); kbuf += core_sge->length; core_sge++; } sqe->sge[0].length = max(bytes, 0); sqe->num_sge = bytes > 0 ? 1 : 0; return bytes; } /* Complete SQ WR's without processing */ static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { int rv = 0; while (wr) { struct siw_sqe sqe = {}; switch (wr->opcode) { case IB_WR_RDMA_WRITE: sqe.opcode = SIW_OP_WRITE; break; case IB_WR_RDMA_READ: sqe.opcode = SIW_OP_READ; break; case IB_WR_RDMA_READ_WITH_INV: sqe.opcode = SIW_OP_READ_LOCAL_INV; break; case IB_WR_SEND: sqe.opcode = SIW_OP_SEND; break; case IB_WR_SEND_WITH_IMM: sqe.opcode = SIW_OP_SEND_WITH_IMM; break; case IB_WR_SEND_WITH_INV: sqe.opcode = SIW_OP_SEND_REMOTE_INV; break; case IB_WR_LOCAL_INV: sqe.opcode = SIW_OP_INVAL_STAG; break; case IB_WR_REG_MR: sqe.opcode = SIW_OP_REG_MR; break; default: rv = -EINVAL; break; } if (!rv) { sqe.id = wr->wr_id; rv = siw_sqe_complete(qp, &sqe, 0, SIW_WC_WR_FLUSH_ERR); } if (rv) { if (bad_wr) *bad_wr = wr; break; } wr = wr->next; } return rv; } /* Complete RQ WR's without processing */ static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct siw_rqe rqe = {}; int rv = 0; while (wr) { rqe.id = wr->wr_id; rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR); if (rv) { if (bad_wr) *bad_wr = wr; break; } wr = wr->next; } return rv; } /* * siw_post_send() * * Post a list of S-WR's to a SQ. * * @base_qp: Base QP contained in siw QP * @wr: Null terminated list of user WR's * @bad_wr: Points to failing WR in case of synchronous failure. */ int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { struct siw_qp *qp = to_siw_qp(base_qp); struct siw_wqe *wqe = tx_wqe(qp); unsigned long flags; int rv = 0; if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) { siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); *bad_wr = wr; return -EINVAL; } /* * Try to acquire QP state lock. Must be non-blocking * to accommodate kernel clients needs. */ if (!down_read_trylock(&qp->state_lock)) { if (qp->attrs.state == SIW_QP_STATE_ERROR) { /* * ERROR state is final, so we can be sure * this state will not change as long as the QP * exists. * * This handles an ib_drain_sq() call with * a concurrent request to set the QP state * to ERROR. */ rv = siw_sq_flush_wr(qp, wr, bad_wr); } else { siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); *bad_wr = wr; rv = -ENOTCONN; } return rv; } if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { if (qp->attrs.state == SIW_QP_STATE_ERROR) { /* * Immediately flush this WR to CQ, if QP * is in ERROR state. SQ is guaranteed to * be empty, so WR complets in-order. * * Typically triggered by ib_drain_sq(). */ rv = siw_sq_flush_wr(qp, wr, bad_wr); } else { siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); *bad_wr = wr; rv = -ENOTCONN; } up_read(&qp->state_lock); return rv; } spin_lock_irqsave(&qp->sq_lock, flags); while (wr) { u32 idx = qp->sq_put % qp->attrs.sq_size; struct siw_sqe *sqe = &qp->sendq[idx]; if (sqe->flags) { siw_dbg_qp(qp, "sq full\n"); rv = -ENOMEM; break; } if (wr->num_sge > qp->attrs.sq_max_sges) { siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); rv = -EINVAL; break; } sqe->id = wr->wr_id; if ((wr->send_flags & IB_SEND_SIGNALED) || (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) sqe->flags |= SIW_WQE_SIGNALLED; if (wr->send_flags & IB_SEND_FENCE) sqe->flags |= SIW_WQE_READ_FENCE; switch (wr->opcode) { case IB_WR_SEND: case IB_WR_SEND_WITH_INV: if (wr->send_flags & IB_SEND_SOLICITED) sqe->flags |= SIW_WQE_SOLICITED; if (!(wr->send_flags & IB_SEND_INLINE)) { siw_copy_sgl(wr->sg_list, sqe->sge, wr->num_sge); sqe->num_sge = wr->num_sge; } else { rv = siw_copy_inline_sgl(wr, sqe); if (rv <= 0) { rv = -EINVAL; break; } sqe->flags |= SIW_WQE_INLINE; sqe->num_sge = 1; } if (wr->opcode == IB_WR_SEND) sqe->opcode = SIW_OP_SEND; else { sqe->opcode = SIW_OP_SEND_REMOTE_INV; sqe->rkey = wr->ex.invalidate_rkey; } break; case IB_WR_RDMA_READ_WITH_INV: case IB_WR_RDMA_READ: /* * iWarp restricts RREAD sink to SGL containing * 1 SGE only. we could relax to SGL with multiple * elements referring the SAME ltag or even sending * a private per-rreq tag referring to a checked * local sgl with MULTIPLE ltag's. */ if (unlikely(wr->num_sge != 1)) { rv = -EINVAL; break; } siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); /* * NOTE: zero length RREAD is allowed! */ sqe->raddr = rdma_wr(wr)->remote_addr; sqe->rkey = rdma_wr(wr)->rkey; sqe->num_sge = 1; if (wr->opcode == IB_WR_RDMA_READ) sqe->opcode = SIW_OP_READ; else sqe->opcode = SIW_OP_READ_LOCAL_INV; break; case IB_WR_RDMA_WRITE: if (!(wr->send_flags & IB_SEND_INLINE)) { siw_copy_sgl(wr->sg_list, &sqe->sge[0], wr->num_sge); sqe->num_sge = wr->num_sge; } else { rv = siw_copy_inline_sgl(wr, sqe); if (unlikely(rv < 0)) { rv = -EINVAL; break; } sqe->flags |= SIW_WQE_INLINE; sqe->num_sge = 1; } sqe->raddr = rdma_wr(wr)->remote_addr; sqe->rkey = rdma_wr(wr)->rkey; sqe->opcode = SIW_OP_WRITE; break; case IB_WR_REG_MR: sqe->base_mr = (uintptr_t)reg_wr(wr)->mr; sqe->rkey = reg_wr(wr)->key; sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; sqe->opcode = SIW_OP_REG_MR; break; case IB_WR_LOCAL_INV: sqe->rkey = wr->ex.invalidate_rkey; sqe->opcode = SIW_OP_INVAL_STAG; break; default: siw_dbg_qp(qp, "ib wr type %d unsupported\n", wr->opcode); rv = -EINVAL; break; } siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n", sqe->opcode, sqe->flags, (void *)(uintptr_t)sqe->id); if (unlikely(rv < 0)) break; /* make SQE only valid after completely written */ smp_wmb(); sqe->flags |= SIW_WQE_VALID; qp->sq_put++; wr = wr->next; } /* * Send directly if SQ processing is not in progress. * Eventual immediate errors (rv < 0) do not affect the involved * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ * processing, if new work is already pending. But rv must be passed * to caller. */ if (wqe->wr_status != SIW_WR_IDLE) { spin_unlock_irqrestore(&qp->sq_lock, flags); goto skip_direct_sending; } rv = siw_activate_tx(qp); spin_unlock_irqrestore(&qp->sq_lock, flags); if (rv <= 0) goto skip_direct_sending; if (rdma_is_kernel_res(&qp->base_qp.res)) { rv = siw_sq_start(qp); } else { qp->tx_ctx.in_syscall = 1; if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) siw_qp_cm_drop(qp, 0); qp->tx_ctx.in_syscall = 0; } skip_direct_sending: up_read(&qp->state_lock); if (rv >= 0) return 0; /* * Immediate error */ siw_dbg_qp(qp, "error %d\n", rv); *bad_wr = wr; return rv; } /* * siw_post_receive() * * Post a list of R-WR's to a RQ. * * @base_qp: Base QP contained in siw QP * @wr: Null terminated list of user WR's * @bad_wr: Points to failing WR in case of synchronous failure. */ int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct siw_qp *qp = to_siw_qp(base_qp); unsigned long flags; int rv = 0; if (qp->srq || qp->attrs.rq_size == 0) { *bad_wr = wr; return -EINVAL; } if (!rdma_is_kernel_res(&qp->base_qp.res)) { siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n"); *bad_wr = wr; return -EINVAL; } /* * Try to acquire QP state lock. Must be non-blocking * to accommodate kernel clients needs. */ if (!down_read_trylock(&qp->state_lock)) { if (qp->attrs.state == SIW_QP_STATE_ERROR) { /* * ERROR state is final, so we can be sure * this state will not change as long as the QP * exists. * * This handles an ib_drain_rq() call with * a concurrent request to set the QP state * to ERROR. */ rv = siw_rq_flush_wr(qp, wr, bad_wr); } else { siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); *bad_wr = wr; rv = -ENOTCONN; } return rv; } if (qp->attrs.state > SIW_QP_STATE_RTS) { if (qp->attrs.state == SIW_QP_STATE_ERROR) { /* * Immediately flush this WR to CQ, if QP * is in ERROR state. RQ is guaranteed to * be empty, so WR complets in-order. * * Typically triggered by ib_drain_rq(). */ rv = siw_rq_flush_wr(qp, wr, bad_wr); } else { siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); *bad_wr = wr; rv = -ENOTCONN; } up_read(&qp->state_lock); return rv; } /* * Serialize potentially multiple producers. * Not needed for single threaded consumer side. */ spin_lock_irqsave(&qp->rq_lock, flags); while (wr) { u32 idx = qp->rq_put % qp->attrs.rq_size; struct siw_rqe *rqe = &qp->recvq[idx]; if (rqe->flags) { siw_dbg_qp(qp, "RQ full\n"); rv = -ENOMEM; break; } if (wr->num_sge > qp->attrs.rq_max_sges) { siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); rv = -EINVAL; break; } rqe->id = wr->wr_id; rqe->num_sge = wr->num_sge; siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); /* make sure RQE is completely written before valid */ smp_wmb(); rqe->flags = SIW_WQE_VALID; qp->rq_put++; wr = wr->next; } spin_unlock_irqrestore(&qp->rq_lock, flags); up_read(&qp->state_lock); if (rv < 0) { siw_dbg_qp(qp, "error %d\n", rv); *bad_wr = wr; } return rv > 0 ? 0 : rv; } int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) { struct siw_cq *cq = to_siw_cq(base_cq); struct siw_device *sdev = to_siw_dev(base_cq->device); struct siw_ucontext *ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); siw_dbg_cq(cq, "free CQ resources\n"); siw_cq_flush(cq); if (ctx) rdma_user_mmap_entry_remove(cq->cq_entry); atomic_dec(&sdev->num_cq); vfree(cq->queue); return 0; } /* * siw_create_cq() * * Populate CQ of requested size * * @base_cq: CQ as allocated by RDMA midlayer * @attr: Initial CQ attributes * @attrs: uverbs bundle */ int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs) { struct ib_udata *udata = &attrs->driver_udata; struct siw_device *sdev = to_siw_dev(base_cq->device); struct siw_cq *cq = to_siw_cq(base_cq); int rv, size = attr->cqe; if (attr->flags) return -EOPNOTSUPP; if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { siw_dbg(base_cq->device, "too many CQ's\n"); rv = -ENOMEM; goto err_out; } if (size < 1 || size > sdev->attrs.max_cqe) { siw_dbg(base_cq->device, "CQ size error: %d\n", size); rv = -EINVAL; goto err_out; } size = roundup_pow_of_two(size); cq->base_cq.cqe = size; cq->num_cqe = size; if (udata) cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) + sizeof(struct siw_cq_ctrl)); else cq->queue = vzalloc(size * sizeof(struct siw_cqe) + sizeof(struct siw_cq_ctrl)); if (cq->queue == NULL) { rv = -ENOMEM; goto err_out; } get_random_bytes(&cq->id, 4); siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id); spin_lock_init(&cq->lock); cq->notify = (struct siw_cq_ctrl *)&cq->queue[size]; if (udata) { struct siw_uresp_create_cq uresp = {}; struct siw_ucontext *ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); size_t length = size * sizeof(struct siw_cqe) + sizeof(struct siw_cq_ctrl); cq->cq_entry = siw_mmap_entry_insert(ctx, cq->queue, length, &uresp.cq_key); if (!cq->cq_entry) { rv = -ENOMEM; goto err_out; } uresp.cq_id = cq->id; uresp.num_cqe = size; if (udata->outlen < sizeof(uresp)) { rv = -EINVAL; goto err_out; } rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rv) goto err_out; } return 0; err_out: siw_dbg(base_cq->device, "CQ creation failed: %d", rv); if (cq->queue) { struct siw_ucontext *ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); if (ctx) rdma_user_mmap_entry_remove(cq->cq_entry); vfree(cq->queue); } atomic_dec(&sdev->num_cq); return rv; } /* * siw_poll_cq() * * Reap CQ entries if available and copy work completion status into * array of WC's provided by caller. Returns number of reaped CQE's. * * @base_cq: Base CQ contained in siw CQ. * @num_cqe: Maximum number of CQE's to reap. * @wc: Array of work completions to be filled by siw. */ int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) { struct siw_cq *cq = to_siw_cq(base_cq); int i; for (i = 0; i < num_cqe; i++) { if (!siw_reap_cqe(cq, wc)) break; wc++; } return i; } /* * siw_req_notify_cq() * * Request notification for new CQE's added to that CQ. * Defined flags: * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification * event if a WQE with notification flag set enters the CQ * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification * event if a WQE enters the CQ. * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the * number of not reaped CQE's regardless of its notification * type and current or new CQ notification settings. * * @base_cq: Base CQ contained in siw CQ. * @flags: Requested notification flags. */ int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) { struct siw_cq *cq = to_siw_cq(base_cq); siw_dbg_cq(cq, "flags: 0x%02x\n", flags); if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) /* * Enable CQ event for next solicited completion. * and make it visible to all associated producers. */ smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED); else /* * Enable CQ event for any signalled completion. * and make it visible to all associated producers. */ smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL); if (flags & IB_CQ_REPORT_MISSED_EVENTS) return cq->cq_put - cq->cq_get; return 0; } /* * siw_dereg_mr() * * Release Memory Region. * * @base_mr: Base MR contained in siw MR. * @udata: points to user context, unused. */ int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) { struct siw_mr *mr = to_siw_mr(base_mr); struct siw_device *sdev = to_siw_dev(base_mr->device); siw_dbg_mem(mr->mem, "deregister MR\n"); atomic_dec(&sdev->num_mr); siw_mr_drop_mem(mr); kfree_rcu(mr, rcu); return 0; } /* * siw_reg_user_mr() * * Register Memory Region. * * @pd: Protection Domain * @start: starting address of MR (virtual address) * @len: len of MR * @rnic_va: not used by siw * @rights: MR access rights * @udata: user buffer to communicate STag and Key. */ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, u64 rnic_va, int rights, struct ib_udata *udata) { struct siw_mr *mr = NULL; struct siw_umem *umem = NULL; struct siw_ureq_reg_mr ureq; struct siw_device *sdev = to_siw_dev(pd->device); int rv; siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n", (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va, (unsigned long long)len); if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { siw_dbg_pd(pd, "too many mr's\n"); rv = -ENOMEM; goto err_out; } if (!len) { rv = -EINVAL; goto err_out; } umem = siw_umem_get(pd->device, start, len, rights); if (IS_ERR(umem)) { rv = PTR_ERR(umem); siw_dbg_pd(pd, "getting user memory failed: %d\n", rv); umem = NULL; goto err_out; } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { rv = -ENOMEM; goto err_out; } rv = siw_mr_add_mem(mr, pd, umem, start, len, rights); if (rv) goto err_out; if (udata) { struct siw_uresp_reg_mr uresp = {}; struct siw_mem *mem = mr->mem; if (udata->inlen < sizeof(ureq)) { rv = -EINVAL; goto err_out; } rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); if (rv) goto err_out; mr->base_mr.lkey |= ureq.stag_key; mr->base_mr.rkey |= ureq.stag_key; mem->stag |= ureq.stag_key; uresp.stag = mem->stag; if (udata->outlen < sizeof(uresp)) { rv = -EINVAL; goto err_out; } rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rv) goto err_out; } mr->mem->stag_valid = 1; return &mr->base_mr; err_out: atomic_dec(&sdev->num_mr); if (mr) { if (mr->mem) siw_mr_drop_mem(mr); kfree_rcu(mr, rcu); } else { if (umem) siw_umem_release(umem); } return ERR_PTR(rv); } struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_sge) { struct siw_device *sdev = to_siw_dev(pd->device); struct siw_mr *mr = NULL; struct siw_pbl *pbl = NULL; int rv; if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { siw_dbg_pd(pd, "too many mr's\n"); rv = -ENOMEM; goto err_out; } if (mr_type != IB_MR_TYPE_MEM_REG) { siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type); rv = -EOPNOTSUPP; goto err_out; } if (max_sge > SIW_MAX_SGE_PBL) { siw_dbg_pd(pd, "too many sge's: %d\n", max_sge); rv = -ENOMEM; goto err_out; } pbl = siw_pbl_alloc(max_sge); if (IS_ERR(pbl)) { rv = PTR_ERR(pbl); siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv); pbl = NULL; goto err_out; } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { rv = -ENOMEM; goto err_out; } rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0); if (rv) goto err_out; mr->mem->is_pbl = 1; siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); return &mr->base_mr; err_out: atomic_dec(&sdev->num_mr); if (!mr) { kfree(pbl); } else { if (mr->mem) siw_mr_drop_mem(mr); kfree_rcu(mr, rcu); } siw_dbg_pd(pd, "failed: %d\n", rv); return ERR_PTR(rv); } /* Just used to count number of pages being mapped */ static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) { return 0; } int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, unsigned int *sg_off) { struct scatterlist *slp; struct siw_mr *mr = to_siw_mr(base_mr); struct siw_mem *mem = mr->mem; struct siw_pbl *pbl = mem->pbl; struct siw_pble *pble; unsigned long pbl_size; int i, rv; if (!pbl) { siw_dbg_mem(mem, "no PBL allocated\n"); return -EINVAL; } pble = pbl->pbe; if (pbl->max_buf < num_sle) { siw_dbg_mem(mem, "too many SGE's: %d > %d\n", num_sle, pbl->max_buf); return -ENOMEM; } for_each_sg(sl, slp, num_sle, i) { if (sg_dma_len(slp) == 0) { siw_dbg_mem(mem, "empty SGE\n"); return -EINVAL; } if (i == 0) { pble->addr = sg_dma_address(slp); pble->size = sg_dma_len(slp); pble->pbl_off = 0; pbl_size = pble->size; pbl->num_buf = 1; } else { /* Merge PBL entries if adjacent */ if (pble->addr + pble->size == sg_dma_address(slp)) { pble->size += sg_dma_len(slp); } else { pble++; pbl->num_buf++; pble->addr = sg_dma_address(slp); pble->size = sg_dma_len(slp); pble->pbl_off = pbl_size; } pbl_size += sg_dma_len(slp); } siw_dbg_mem(mem, "sge[%d], size %u, addr 0x%p, total %lu\n", i, pble->size, ib_virt_dma_to_ptr(pble->addr), pbl_size); } rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); if (rv > 0) { mem->len = base_mr->length; mem->va = base_mr->iova; siw_dbg_mem(mem, "%llu bytes, start 0x%pK, %u SLE to %u entries\n", mem->len, (void *)(uintptr_t)mem->va, num_sle, pbl->num_buf); } return rv; } /* * siw_get_dma_mr() * * Create a (empty) DMA memory region, where no umem is attached. */ struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights) { struct siw_device *sdev = to_siw_dev(pd->device); struct siw_mr *mr = NULL; int rv; if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { siw_dbg_pd(pd, "too many mr's\n"); rv = -ENOMEM; goto err_out; } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { rv = -ENOMEM; goto err_out; } rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights); if (rv) goto err_out; mr->mem->stag_valid = 1; siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); return &mr->base_mr; err_out: if (rv) kfree(mr); atomic_dec(&sdev->num_mr); return ERR_PTR(rv); } /* * siw_create_srq() * * Create Shared Receive Queue of attributes @init_attrs * within protection domain given by @pd. * * @base_srq: Base SRQ contained in siw SRQ. * @init_attrs: SRQ init attributes. * @udata: points to user context */ int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *init_attrs, struct ib_udata *udata) { struct siw_srq *srq = to_siw_srq(base_srq); struct ib_srq_attr *attrs = &init_attrs->attr; struct siw_device *sdev = to_siw_dev(base_srq->device); struct siw_ucontext *ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); int rv; if (init_attrs->srq_type != IB_SRQT_BASIC) return -EOPNOTSUPP; if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { siw_dbg_pd(base_srq->pd, "too many SRQ's\n"); rv = -ENOMEM; goto err_out; } if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { rv = -EINVAL; goto err_out; } srq->max_sge = attrs->max_sge; srq->num_rqe = roundup_pow_of_two(attrs->max_wr); srq->limit = attrs->srq_limit; if (srq->limit) srq->armed = true; srq->is_kernel_res = !udata; if (udata) srq->recvq = vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe)); else srq->recvq = vcalloc(srq->num_rqe, sizeof(struct siw_rqe)); if (srq->recvq == NULL) { rv = -ENOMEM; goto err_out; } if (udata) { struct siw_uresp_create_srq uresp = {}; size_t length = srq->num_rqe * sizeof(struct siw_rqe); srq->srq_entry = siw_mmap_entry_insert(ctx, srq->recvq, length, &uresp.srq_key); if (!srq->srq_entry) { rv = -ENOMEM; goto err_out; } uresp.num_rqe = srq->num_rqe; if (udata->outlen < sizeof(uresp)) { rv = -EINVAL; goto err_out; } rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); if (rv) goto err_out; } spin_lock_init(&srq->lock); siw_dbg_pd(base_srq->pd, "[SRQ]: success\n"); return 0; err_out: if (srq->recvq) { if (ctx) rdma_user_mmap_entry_remove(srq->srq_entry); vfree(srq->recvq); } atomic_dec(&sdev->num_srq); return rv; } /* * siw_modify_srq() * * Modify SRQ. The caller may resize SRQ and/or set/reset notification * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. * * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE * parameter. siw_modify_srq() does not check the attrs->max_sge param. */ int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) { struct siw_srq *srq = to_siw_srq(base_srq); unsigned long flags; int rv = 0; spin_lock_irqsave(&srq->lock, flags); if (attr_mask & IB_SRQ_MAX_WR) { /* resize request not yet supported */ rv = -EOPNOTSUPP; goto out; } if (attr_mask & IB_SRQ_LIMIT) { if (attrs->srq_limit) { if (unlikely(attrs->srq_limit > srq->num_rqe)) { rv = -EINVAL; goto out; } srq->armed = true; } else { srq->armed = false; } srq->limit = attrs->srq_limit; } out: spin_unlock_irqrestore(&srq->lock, flags); return rv; } /* * siw_query_srq() * * Query SRQ attributes. */ int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) { struct siw_srq *srq = to_siw_srq(base_srq); unsigned long flags; spin_lock_irqsave(&srq->lock, flags); attrs->max_wr = srq->num_rqe; attrs->max_sge = srq->max_sge; attrs->srq_limit = srq->limit; spin_unlock_irqrestore(&srq->lock, flags); return 0; } /* * siw_destroy_srq() * * Destroy SRQ. * It is assumed that the SRQ is not referenced by any * QP anymore - the code trusts the RDMA core environment to keep track * of QP references. */ int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata) { struct siw_srq *srq = to_siw_srq(base_srq); struct siw_device *sdev = to_siw_dev(base_srq->device); struct siw_ucontext *ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext, base_ucontext); if (ctx) rdma_user_mmap_entry_remove(srq->srq_entry); vfree(srq->recvq); atomic_dec(&sdev->num_srq); return 0; } /* * siw_post_srq_recv() * * Post a list of receive queue elements to SRQ. * NOTE: The function does not check or lock a certain SRQ state * during the post operation. The code simply trusts the * RDMA core environment. * * @base_srq: Base SRQ contained in siw SRQ * @wr: List of R-WR's * @bad_wr: Updated to failing WR if posting fails. */ int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct siw_srq *srq = to_siw_srq(base_srq); unsigned long flags; int rv = 0; if (unlikely(!srq->is_kernel_res)) { siw_dbg_pd(base_srq->pd, "[SRQ]: no kernel post_recv for mapped srq\n"); rv = -EINVAL; goto out; } /* * Serialize potentially multiple producers. * Also needed to serialize potentially multiple * consumers. */ spin_lock_irqsave(&srq->lock, flags); while (wr) { u32 idx = srq->rq_put % srq->num_rqe; struct siw_rqe *rqe = &srq->recvq[idx]; if (rqe->flags) { siw_dbg_pd(base_srq->pd, "SRQ full\n"); rv = -ENOMEM; break; } if (unlikely(wr->num_sge > srq->max_sge)) { siw_dbg_pd(base_srq->pd, "[SRQ]: too many sge's: %d\n", wr->num_sge); rv = -EINVAL; break; } rqe->id = wr->wr_id; rqe->num_sge = wr->num_sge; siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); /* Make sure S-RQE is completely written before valid */ smp_wmb(); rqe->flags = SIW_WQE_VALID; srq->rq_put++; wr = wr->next; } spin_unlock_irqrestore(&srq->lock, flags); out: if (unlikely(rv < 0)) { siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv); *bad_wr = wr; } return rv; } void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) { struct ib_event event; struct ib_qp *base_qp = &qp->base_qp; /* * Do not report asynchronous errors on QP which gets * destroyed via verbs interface (siw_destroy_qp()) */ if (qp->attrs.flags & SIW_QP_IN_DESTROY) return; event.event = etype; event.device = base_qp->device; event.element.qp = base_qp; if (base_qp->event_handler) { siw_dbg_qp(qp, "reporting event %d\n", etype); base_qp->event_handler(&event, base_qp->qp_context); } } void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) { struct ib_event event; struct ib_cq *base_cq = &cq->base_cq; event.event = etype; event.device = base_cq->device; event.element.cq = base_cq; if (base_cq->event_handler) { siw_dbg_cq(cq, "reporting CQ event %d\n", etype); base_cq->event_handler(&event, base_cq->cq_context); } } void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) { struct ib_event event; struct ib_srq *base_srq = &srq->base_srq; event.event = etype; event.device = base_srq->device; event.element.srq = base_srq; if (base_srq->event_handler) { siw_dbg_pd(srq->base_srq.pd, "reporting SRQ event %d\n", etype); base_srq->event_handler(&event, base_srq->srq_context); } } void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype) { struct ib_event event; event.event = etype; event.device = &sdev->base_dev; event.element.port_num = port; siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype); ib_dispatch_event(&event); } |
156 103 103 103 103 103 30 30 21 6 19 5 4 1 81 77 7 74 21 21 77 24 7 1 27 27 2 1 27 20 12 4 13 7 8 13 13 11 8 21 12 2 2 1 1 2 21 19 2 7 6 1 6 2 6 2 14 21 21 1 7 1 5 5 1 1 1 3 3 1 1 15 15 3 5 3 19 1 6 14 1 1 1 1 1 6 1 5 6 6 5 6 5 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This abstraction carries sctp events to the ULP (sockets). * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * La Monte H.P. Yarroll <piggy@acm.org> * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/busy_poll.h> #include <net/sctp/structs.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. */ static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *); static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *, struct sctp_ulpevent *); static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq); /* 1st Level Abstractions */ /* Initialize a ULP queue from a block of memory. */ void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc) { memset(ulpq, 0, sizeof(struct sctp_ulpq)); ulpq->asoc = asoc; skb_queue_head_init(&ulpq->reasm); skb_queue_head_init(&ulpq->reasm_uo); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0; } /* Flush the reassembly and ordering queues. */ void sctp_ulpq_flush(struct sctp_ulpq *ulpq) { struct sk_buff *skb; struct sctp_ulpevent *event; while ((skb = __skb_dequeue(&ulpq->lobby)) != NULL) { event = sctp_skb2event(skb); sctp_ulpevent_free(event); } while ((skb = __skb_dequeue(&ulpq->reasm)) != NULL) { event = sctp_skb2event(skb); sctp_ulpevent_free(event); } while ((skb = __skb_dequeue(&ulpq->reasm_uo)) != NULL) { event = sctp_skb2event(skb); sctp_ulpevent_free(event); } } /* Dispose of a ulpqueue. */ void sctp_ulpq_free(struct sctp_ulpq *ulpq) { sctp_ulpq_flush(ulpq); } /* Process an incoming DATA chunk. */ int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sk_buff_head temp; struct sctp_ulpevent *event; int event_eor = 0; /* Create an event from the incoming chunk. */ event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->ssn = ntohs(chunk->subh.data_hdr->ssn); event->ppid = chunk->subh.data_hdr->ppid; /* Do reassembly if needed. */ event = sctp_ulpq_reasm(ulpq, event); /* Do ordering if needed. */ if (event) { /* Create a temporary list to collect chunks on. */ skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_ulpq_order(ulpq, event); } /* Send event to the ULP. 'event' is the sctp_ulpevent for * very first SKB on the 'temp' list. */ if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; sctp_ulpq_tail_event(ulpq, &temp); } return event_eor; } /* Add a new event for propagation to the ULP. */ /* Clear the partial delivery mode for this socket. Note: This * assumes that no association is currently in partial delivery mode. */ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc) { struct sctp_sock *sp = sctp_sk(sk); if (atomic_dec_and_test(&sp->pd_mode)) { /* This means there are no other associations in PD, so * we can go ahead and clear out the lobby in one shot */ if (!skb_queue_empty(&sp->pd_lobby)) { skb_queue_splice_tail_init(&sp->pd_lobby, &sk->sk_receive_queue); return 1; } } else { /* There are other associations in PD, so we only need to * pull stuff out of the lobby that belongs to the * associations that is exiting PD (all of its notifications * are posted here). */ if (!skb_queue_empty(&sp->pd_lobby) && asoc) { struct sk_buff *skb, *tmp; struct sctp_ulpevent *event; sctp_skb_for_each(skb, &sp->pd_lobby, tmp) { event = sctp_skb2event(skb); if (event->asoc == asoc) { __skb_unlink(skb, &sp->pd_lobby); __skb_queue_tail(&sk->sk_receive_queue, skb); } } } } return 0; } /* Set the pd_mode on the socket and ulpq */ static void sctp_ulpq_set_pd(struct sctp_ulpq *ulpq) { struct sctp_sock *sp = sctp_sk(ulpq->asoc->base.sk); atomic_inc(&sp->pd_mode); ulpq->pd_mode = 1; } /* Clear the pd_mode and restart any pending messages waiting for delivery. */ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) { ulpq->pd_mode = 0; sctp_ulpq_reasm_drain(ulpq); return sctp_clear_pd(ulpq->asoc->base.sk, ulpq->asoc); } int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct sk_buff_head *queue; struct sk_buff *skb; int clear_pd = 0; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); /* If the socket is just going to throw this away, do not * even try to deliver it. */ if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } /* Check if the user wishes to receive this event. */ if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; /* If we are in partial delivery mode, post to the lobby until * partial delivery is cleared, unless, of course _this_ is * the association the cause of the partial delivery. */ if (atomic_read(&sp->pd_mode) == 0) { queue = &sk->sk_receive_queue; } else { if (ulpq->pd_mode) { /* If the association is in partial delivery, we * need to finish delivering the partially processed * packet before passing any other data. This is * because we don't truly support stream interleaving. */ if ((event->msg_flags & MSG_NOTIFICATION) || (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK))) queue = &sp->pd_lobby; else { clear_pd = event->msg_flags & MSG_EOR; queue = &sk->sk_receive_queue; } } else { /* * If fragment interleave is enabled, we * can queue this to the receive queue instead * of the lobby. */ if (sp->frag_interleave) queue = &sk->sk_receive_queue; else queue = &sp->pd_lobby; } } skb_queue_splice_tail_init(skb_list, queue); /* Did we just complete partial delivery and need to get * rolling again? Move pending data to the receive * queue. */ if (clear_pd) sctp_ulpq_clear_pd(ulpq); if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { if (!sock_owned_by_user(sk)) sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } /* 2nd Level Abstractions */ /* Helper function to store chunks that need to be reassembled. */ static void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *pos; struct sctp_ulpevent *cevent; __u32 tsn, ctsn; tsn = event->tsn; /* See if it belongs at the end. */ pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } /* Short circuit just dropping it at the end. */ cevent = sctp_skb2event(pos); ctsn = cevent->tsn; if (TSN_lt(ctsn, tsn)) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } /* Find the right place in this list. We store them by TSN. */ skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); ctsn = cevent->tsn; if (TSN_lt(tsn, ctsn)) break; } /* Insert before pos. */ __skb_queue_before(&ulpq->reasm, pos, sctp_event2skb(event)); } /* Helper function to return an event corresponding to the reassembled * datagram. * This routine creates a re-assembled skb given the first and last skb's * as stored in the reassembly queue. The skb's may be non-linear if the sctp * payload was fragmented on the way and ip had to reassemble them. * We add the rest of skb's to the first skb's fraglist. */ struct sctp_ulpevent *sctp_make_reassembled_event(struct net *net, struct sk_buff_head *queue, struct sk_buff *f_frag, struct sk_buff *l_frag) { struct sk_buff *pos; struct sk_buff *new = NULL; struct sctp_ulpevent *event; struct sk_buff *pnext, *last; struct sk_buff *list = skb_shinfo(f_frag)->frag_list; /* Store the pointer to the 2nd skb */ if (f_frag == l_frag) pos = NULL; else pos = f_frag->next; /* Get the last skb in the f_frag's frag_list if present. */ for (last = list; list; last = list, list = list->next) ; /* Add the list of remaining fragments to the first fragments * frag_list. */ if (last) last->next = pos; else { if (skb_cloned(f_frag)) { /* This is a cloned skb, we can't just modify * the frag_list. We need a new skb to do that. * Instead of calling skb_unshare(), we'll do it * ourselves since we need to delay the free. */ new = skb_copy(f_frag, GFP_ATOMIC); if (!new) return NULL; /* try again later */ sctp_skb_set_owner_r(new, f_frag->sk); skb_shinfo(new)->frag_list = pos; } else skb_shinfo(f_frag)->frag_list = pos; } /* Remove the first fragment from the reassembly queue. */ __skb_unlink(f_frag, queue); /* if we did unshare, then free the old skb and re-assign */ if (new) { kfree_skb(f_frag); f_frag = new; } while (pos) { pnext = pos->next; /* Update the len and data_len fields of the first fragment. */ f_frag->len += pos->len; f_frag->data_len += pos->len; /* Remove the fragment from the reassembly queue. */ __skb_unlink(pos, queue); /* Break if we have reached the last fragment. */ if (pos == l_frag) break; pos->next = pnext; pos = pnext; } event = sctp_skb2event(f_frag); SCTP_INC_STATS(net, SCTP_MIB_REASMUSRMSGS); return event; } /* Helper function to check if an incoming chunk has filled up the last * missing fragment in a SCTP datagram and return the corresponding event. */ static struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ulpq) { struct sk_buff *pos; struct sctp_ulpevent *cevent; struct sk_buff *first_frag = NULL; __u32 ctsn, next_tsn; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; size_t pd_len = 0; struct sctp_association *asoc; u32 pd_point; /* Initialized to 0 just to avoid compiler warning message. Will * never be used with this value. It is referenced only after it * is set when we find the first fragment of a message. */ next_tsn = 0; /* The chunks are held in the reasm queue sorted by TSN. * Walk through the queue sequentially and look for a sequence of * fragmented chunks that complete a datagram. * 'first_frag' and next_tsn are reset when we find a chunk which * is the first fragment of a datagram. Once these 2 fields are set * we expect to find the remaining middle fragments and the last * fragment in order. If not, first_frag is reset to NULL and we * start the next pass when we find another first fragment. * * There is a potential to do partial delivery if user sets * SCTP_PARTIAL_DELIVERY_POINT option. Lets count some things here * to see if can do PD. */ skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); ctsn = cevent->tsn; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: /* If this "FIRST_FRAG" is the first * element in the queue, then count it towards * possible PD. */ if (skb_queue_is_first(&ulpq->reasm, pos)) { pd_first = pos; pd_last = pos; pd_len = pos->len; } else { pd_first = NULL; pd_last = NULL; pd_len = 0; } first_frag = pos; next_tsn = ctsn + 1; break; case SCTP_DATA_MIDDLE_FRAG: if ((first_frag) && (ctsn == next_tsn)) { next_tsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else first_frag = NULL; break; case SCTP_DATA_LAST_FRAG: if (first_frag && (ctsn == next_tsn)) goto found; else first_frag = NULL; break; } } asoc = ulpq->asoc; if (pd_first) { /* Make sure we can enter partial deliver. * We can trigger partial delivery only if framgent * interleave is set, or the socket is not already * in partial delivery. */ if (!sctp_sk(asoc->base.sk)->frag_interleave && atomic_read(&sctp_sk(asoc->base.sk)->pd_mode)) goto done; cevent = sctp_skb2event(pd_first); pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) sctp_ulpq_set_pd(ulpq); } } done: return retval; found: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; goto done; } /* Retrieve the next set of fragments of a partial message. */ static struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq) { struct sk_buff *pos, *last_frag, *first_frag; struct sctp_ulpevent *cevent; __u32 ctsn, next_tsn; int is_last; struct sctp_ulpevent *retval; /* The chunks are held in the reasm queue sorted by TSN. * Walk through the queue sequentially and look for the first * sequence of fragmented chunks. */ if (skb_queue_empty(&ulpq->reasm)) return NULL; last_frag = first_frag = NULL; retval = NULL; next_tsn = 0; is_last = 0; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); ctsn = cevent->tsn; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!first_frag) return NULL; goto done; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { first_frag = pos; next_tsn = ctsn + 1; last_frag = pos; } else if (next_tsn == ctsn) { next_tsn++; last_frag = pos; } else goto done; break; case SCTP_DATA_LAST_FRAG: if (!first_frag) first_frag = pos; else if (ctsn != next_tsn) goto done; last_frag = pos; is_last = 1; goto done; default: return NULL; } } /* We have the reassembled event. There is no need to look * further. */ done: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval && is_last) retval->msg_flags |= MSG_EOR; return retval; } /* Helper function to reassemble chunks. Hold chunks on the reasm queue that * need reassembling. */ static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; /* Check if this is part of a fragmented message. */ if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_ulpq_store_reasm(ulpq, event); if (!ulpq->pd_mode) retval = sctp_ulpq_retrieve_reassembled(ulpq); else { __u32 ctsn, ctsnap; /* Do not even bother unless this is the next tsn to * be delivered. */ ctsn = event->tsn; ctsnap = sctp_tsnmap_get_ctsn(&ulpq->asoc->peer.tsn_map); if (TSN_lte(ctsn, ctsnap)) retval = sctp_ulpq_retrieve_partial(ulpq); } return retval; } /* Retrieve the first part (sequential fragments) for partial delivery. */ static struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq) { struct sk_buff *pos, *last_frag, *first_frag; struct sctp_ulpevent *cevent; __u32 ctsn, next_tsn; struct sctp_ulpevent *retval; /* The chunks are held in the reasm queue sorted by TSN. * Walk through the queue sequentially and look for a sequence of * fragmented chunks that start a datagram. */ if (skb_queue_empty(&ulpq->reasm)) return NULL; last_frag = first_frag = NULL; retval = NULL; next_tsn = 0; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); ctsn = cevent->tsn; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!first_frag) { first_frag = pos; next_tsn = ctsn + 1; last_frag = pos; } else goto done; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) return NULL; if (ctsn == next_tsn) { next_tsn++; last_frag = pos; } else goto done; break; case SCTP_DATA_LAST_FRAG: if (!first_frag) return NULL; else goto done; break; default: return NULL; } } /* We have the reassembled event. There is no need to look * further. */ done: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); return retval; } /* * Flush out stale fragments from the reassembly queue when processing * a Forward TSN. * * RFC 3758, Section 3.6 * * After receiving and processing a FORWARD TSN, the data receiver MUST * take cautions in updating its re-assembly queue. The receiver MUST * remove any partially reassembled message, which is still missing one * or more TSNs earlier than or equal to the new cumulative TSN point. * In the event that the receiver has invoked the partial delivery API, * a notification SHOULD also be generated to inform the upper layer API * that the message being partially delivered will NOT be completed. */ void sctp_ulpq_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 fwd_tsn) { struct sk_buff *pos, *tmp; struct sctp_ulpevent *event; __u32 tsn; if (skb_queue_empty(&ulpq->reasm)) return; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { event = sctp_skb2event(pos); tsn = event->tsn; /* Since the entire message must be abandoned by the * sender (item A3 in Section 3.5, RFC 3758), we can * free all fragments on the list that are less then * or equal to ctsn_point */ if (TSN_lte(tsn, fwd_tsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } else break; } } /* * Drain the reassembly queue. If we just cleared parted delivery, it * is possible that the reassembly queue will contain already reassembled * messages. Retrieve any such messages and give them to the user. */ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq) { struct sctp_ulpevent *event = NULL; if (skb_queue_empty(&ulpq->reasm)) return; while ((event = sctp_ulpq_retrieve_reassembled(ulpq)) != NULL) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); /* Do ordering if needed. */ if (event->msg_flags & MSG_EOR) event = sctp_ulpq_order(ulpq, event); /* Send event to the ULP. 'event' is the * sctp_ulpevent for very first SKB on the temp' list. */ if (event) sctp_ulpq_tail_event(ulpq, &temp); } } /* Helper function to gather skbs that have possibly become * ordered by an incoming chunk. */ static void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_stream *stream; __u16 sid, csid, cssn; sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *) sctp_event2skb(event)->prev; /* We are holding the chunks by stream, by SSN. */ sctp_skb_for_each(pos, &ulpq->lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; /* Have we gone too far? */ if (csid > sid) break; /* Have we not gone far enough? */ if (csid < sid) continue; if (cssn != sctp_ssn_peek(stream, in, sid)) break; /* Found it, so mark in the stream. */ sctp_ssn_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); /* Attach all gathered skbs to the event. */ __skb_queue_tail(event_list, pos); } } /* Helper function to store chunks needing ordering. */ static void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *pos; struct sctp_ulpevent *cevent; __u16 sid, csid; __u16 ssn, cssn; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } sid = event->stream; ssn = event->ssn; cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; if (sid > csid) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if ((sid == csid) && SSN_lt(cssn, ssn)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } /* Find the right place in this list. We store them by * stream ID and then by SSN. */ skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; if (csid > sid) break; if (csid == sid && SSN_lt(ssn, cssn)) break; } /* Insert before pos. */ __skb_queue_before(&ulpq->lobby, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { __u16 sid, ssn; struct sctp_stream *stream; /* Check if this message needs ordering. */ if (event->msg_flags & SCTP_DATA_UNORDERED) return event; /* Note: The stream ID must be verified before this routine. */ sid = event->stream; ssn = event->ssn; stream = &ulpq->asoc->stream; /* Is this the expected SSN for this stream ID? */ if (ssn != sctp_ssn_peek(stream, in, sid)) { /* We've received something out of order, so find where it * needs to be placed. We order by stream and then by SSN. */ sctp_ulpq_store_ordered(ulpq, event); return NULL; } /* Mark that the next chunk has been found. */ sctp_ssn_next(stream, in, sid); /* Go find any other chunks that were waiting for * ordering. */ sctp_ulpq_retrieve_ordered(ulpq, event); return event; } /* Helper function to gather skbs that have possibly become * ordered by forward tsn skipping their dependencies. */ static void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sk_buff *pos, *tmp; struct sctp_ulpevent *cevent; struct sctp_ulpevent *event; struct sctp_stream *stream; struct sk_buff_head temp; struct sk_buff_head *lobby = &ulpq->lobby; __u16 csid, cssn; stream = &ulpq->asoc->stream; /* We are holding the chunks by stream, by SSN. */ skb_queue_head_init(&temp); event = NULL; sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; /* Have we gone too far? */ if (csid > sid) break; /* Have we not gone far enough? */ if (csid < sid) continue; /* see if this ssn has been marked by skipping */ if (!SSN_lt(cssn, sctp_ssn_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); if (!event) /* Create a temporary list to collect chunks on. */ event = sctp_skb2event(pos); /* Attach all gathered skbs to the event. */ __skb_queue_tail(&temp, pos); } /* If we didn't reap any data, see if the next expected SSN * is next on the queue and if so, use that. */ if (event == NULL && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *) pos->cb; csid = cevent->stream; cssn = cevent->ssn; if (csid == sid && cssn == sctp_ssn_peek(stream, in, csid)) { sctp_ssn_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } /* Send event to the ULP. 'event' is the sctp_ulpevent for * very first SKB on the 'temp' list. */ if (event) { /* see if we have more ordered that we can deliver */ sctp_ulpq_retrieve_ordered(ulpq, event); sctp_ulpq_tail_event(ulpq, &temp); } } /* Skip over an SSN. This is used during the processing of * Forwared TSN chunk to skip over the abandoned ordered data */ void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) { struct sctp_stream *stream; /* Note: The stream ID must be verified before this routine. */ stream = &ulpq->asoc->stream; /* Is this an old SSN? If so ignore. */ if (SSN_lt(ssn, sctp_ssn_peek(stream, in, sid))) return; /* Mark that we are no longer expecting this SSN or lower. */ sctp_ssn_skip(stream, in, sid, ssn); /* Go find any other chunks that were waiting for * ordering and deliver them if needed. */ sctp_ulpq_reap_ordered(ulpq, sid); } __u16 sctp_ulpq_renege_list(struct sctp_ulpq *ulpq, struct sk_buff_head *list, __u16 needed) { __u16 freed = 0; __u32 tsn, last_tsn; struct sk_buff *skb, *flist, *last; struct sctp_ulpevent *event; struct sctp_tsnmap *tsnmap; tsnmap = &ulpq->asoc->peer.tsn_map; while ((skb = skb_peek_tail(list)) != NULL) { event = sctp_skb2event(skb); tsn = event->tsn; /* Don't renege below the Cumulative TSN ACK Point. */ if (TSN_lte(tsn, sctp_tsnmap_get_ctsn(tsnmap))) break; /* Events in ordering queue may have multiple fragments * corresponding to additional TSNs. Sum the total * freed space; find the last TSN. */ freed += skb_headlen(skb); flist = skb_shinfo(skb)->frag_list; for (last = flist; flist; flist = flist->next) { last = flist; freed += skb_headlen(last); } if (last) last_tsn = sctp_skb2event(last)->tsn; else last_tsn = tsn; /* Unlink the event, then renege all applicable TSNs. */ __skb_unlink(skb, list); sctp_ulpevent_free(event); while (TSN_lte(tsn, last_tsn)) { sctp_tsnmap_renege(tsnmap, tsn); tsn++; } if (freed >= needed) return freed; } return freed; } /* Renege 'needed' bytes from the ordering queue. */ static __u16 sctp_ulpq_renege_order(struct sctp_ulpq *ulpq, __u16 needed) { return sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); } /* Renege 'needed' bytes from the reassembly queue. */ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed) { return sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); } /* Partial deliver the first message as there is pressure on rwnd. */ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sctp_association *asoc; struct sctp_sock *sp; __u32 ctsn; struct sk_buff *skb; asoc = ulpq->asoc; sp = sctp_sk(asoc->base.sk); /* If the association is already in Partial Delivery mode * we have nothing to do. */ if (ulpq->pd_mode) return; /* Data must be at or below the Cumulative TSN ACK Point to * start partial delivery. */ skb = skb_peek(&asoc->ulpq.reasm); if (skb != NULL) { ctsn = sctp_skb2event(skb)->tsn; if (!TSN_lte(ctsn, sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map))) return; } /* If the user enabled fragment interleave socket option, * multiple associations can enter partial delivery. * Otherwise, we can only enter partial delivery if the * socket is not in partial deliver mode. */ if (sp->frag_interleave || atomic_read(&sp->pd_mode) == 0) { /* Is partial delivery possible? */ event = sctp_ulpq_retrieve_first(ulpq); /* Send event to the ULP. */ if (event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_ulpq_tail_event(ulpq, &temp); sctp_ulpq_set_pd(ulpq); return; } } } /* Renege some packets to make room for an incoming chunk. */ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_data_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_order(ulpq, needed); if (freed < needed) freed += sctp_ulpq_renege_frags(ulpq, needed - freed); } /* If able to free enough room, accept this chunk. */ if (sk_rmem_schedule(asoc->base.sk, chunk->skb, needed) && freed >= needed) { int retval = sctp_ulpq_tail_data(ulpq, chunk, gfp); /* * Enter partial delivery if chunk has not been * delivered; otherwise, drain the reassembly queue. */ if (retval <= 0) sctp_ulpq_partial_delivery(ulpq, gfp); else if (retval == 1) sctp_ulpq_reasm_drain(ulpq); } } /* Notify the application if an association is aborted and in * partial delivery mode. Send up any pending received messages. */ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *ev = NULL; struct sctp_sock *sp; struct sock *sk; if (!ulpq->pd_mode) return; sk = ulpq->asoc->base.sk; sp = sctp_sk(sk); if (sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, SCTP_PARTIAL_DELIVERY_EVENT)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, 0, 0, 0, gfp); if (ev) __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); /* If there is data waiting, send it up the socket now. */ if ((sctp_ulpq_clear_pd(ulpq) || ev) && !sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } |
10 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * File: af_phonet.h * * Phonet sockets kernel definitions * * Copyright (C) 2008 Nokia Corporation. */ #ifndef AF_PHONET_H #define AF_PHONET_H #include <linux/phonet.h> #include <linux/skbuff.h> #include <net/sock.h> /* * The lower layers may not require more space, ever. Make sure it's * enough. */ #define MAX_PHONET_HEADER (8 + MAX_HEADER) /* * Every Phonet* socket has this structure first in its * protocol-specific structure under name c. */ struct pn_sock { struct sock sk; u16 sobject; u16 dobject; u8 resource; }; static inline struct pn_sock *pn_sk(struct sock *sk) { return (struct pn_sock *)sk; } extern const struct proto_ops phonet_dgram_ops; void pn_sock_init(void); struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa); void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb); void phonet_get_local_port_range(int *min, int *max); int pn_sock_hash(struct sock *sk); void pn_sock_unhash(struct sock *sk); int pn_sock_get_port(struct sock *sk, unsigned short sport); struct sock *pn_find_sock_by_res(struct net *net, u8 res); int pn_sock_bind_res(struct sock *sock, u8 res); int pn_sock_unbind_res(struct sock *sk, u8 res); void pn_sock_unbind_all_res(struct sock *sk); int pn_skb_send(struct sock *sk, struct sk_buff *skb, const struct sockaddr_pn *target); static inline struct phonethdr *pn_hdr(struct sk_buff *skb) { return (struct phonethdr *)skb_network_header(skb); } static inline struct phonetmsg *pn_msg(struct sk_buff *skb) { return (struct phonetmsg *)skb_transport_header(skb); } /* * Get the other party's sockaddr from received skb. The skb begins * with a Phonet header. */ static inline void pn_skb_get_src_sockaddr(struct sk_buff *skb, struct sockaddr_pn *sa) { struct phonethdr *ph = pn_hdr(skb); u16 obj = pn_object(ph->pn_sdev, ph->pn_sobj); sa->spn_family = AF_PHONET; pn_sockaddr_set_object(sa, obj); pn_sockaddr_set_resource(sa, ph->pn_res); memset(sa->spn_zero, 0, sizeof(sa->spn_zero)); } static inline void pn_skb_get_dst_sockaddr(struct sk_buff *skb, struct sockaddr_pn *sa) { struct phonethdr *ph = pn_hdr(skb); u16 obj = pn_object(ph->pn_rdev, ph->pn_robj); sa->spn_family = AF_PHONET; pn_sockaddr_set_object(sa, obj); pn_sockaddr_set_resource(sa, ph->pn_res); memset(sa->spn_zero, 0, sizeof(sa->spn_zero)); } /* Protocols in Phonet protocol family. */ struct phonet_protocol { const struct proto_ops *ops; struct proto *prot; int sock_type; }; int phonet_proto_register(unsigned int protocol, const struct phonet_protocol *pp); void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp); int phonet_sysctl_init(void); void phonet_sysctl_exit(void); int isi_register(void); void isi_unregister(void); static inline bool sk_is_phonet(struct sock *sk) { return sk->sk_family == PF_PHONET; } static inline int phonet_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { int karg; switch (cmd) { case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: if (get_user(karg, (int __user *)arg)) return -EFAULT; return sk->sk_prot->ioctl(sk, cmd, &karg); } /* A positive return value means that the ioctl was not processed */ return 1; } #endif |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __CPUSET_INTERNAL_H #define __CPUSET_INTERNAL_H #include <linux/cgroup.h> #include <linux/cpu.h> #include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/spinlock.h> #include <linux/union_find.h> /* See "Frequency meter" comments, below. */ struct fmeter { int cnt; /* unprocessed events count */ int val; /* most recent output value */ time64_t time; /* clock (secs) when val computed */ spinlock_t lock; /* guards read or write of above */ }; /* * Invalid partition error code */ enum prs_errcode { PERR_NONE = 0, PERR_INVCPUS, PERR_INVPARENT, PERR_NOTPART, PERR_NOTEXCL, PERR_NOCPUS, PERR_HOTPLUG, PERR_CPUSEMPTY, PERR_HKEEPING, PERR_ACCESS, }; /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, CS_MEMORY_MIGRATE, CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, } cpuset_flagbits_t; /* The various types of files and directories in a cpuset file system */ typedef enum { FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, FILE_EFFECTIVE_CPULIST, FILE_EFFECTIVE_MEMLIST, FILE_SUBPARTS_CPULIST, FILE_EXCLUSIVE_CPULIST, FILE_EFFECTIVE_XCPULIST, FILE_ISOLATED_CPULIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_MEM_HARDWALL, FILE_SCHED_LOAD_BALANCE, FILE_PARTITION_ROOT, FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, } cpuset_filetype_t; struct cpuset { struct cgroup_subsys_state css; unsigned long flags; /* "unsigned long" so bitops work */ /* * On default hierarchy: * * The user-configured masks can only be changed by writing to * cpuset.cpus and cpuset.mems, and won't be limited by the * parent masks. * * The effective masks is the real masks that apply to the tasks * in the cpuset. They may be changed if the configured masks are * changed or hotplug happens. * * effective_mask == configured_mask & parent's effective_mask, * and if it ends up empty, it will inherit the parent's mask. * * * On legacy hierarchy: * * The user-configured masks are always the same with effective masks. */ /* user-configured CPUs and Memory Nodes allow to tasks */ cpumask_var_t cpus_allowed; nodemask_t mems_allowed; /* effective CPUs and Memory Nodes allow to tasks */ cpumask_var_t effective_cpus; nodemask_t effective_mems; /* * Exclusive CPUs dedicated to current cgroup (default hierarchy only) * * The effective_cpus of a valid partition root comes solely from its * effective_xcpus and some of the effective_xcpus may be distributed * to sub-partitions below & hence excluded from its effective_cpus. * For a valid partition root, its effective_cpus have no relationship * with cpus_allowed unless its exclusive_cpus isn't set. * * This value will only be set if either exclusive_cpus is set or * when this cpuset becomes a local partition root. */ cpumask_var_t effective_xcpus; /* * Exclusive CPUs as requested by the user (default hierarchy only) * * Its value is independent of cpus_allowed and designates the set of * CPUs that can be granted to the current cpuset or its children when * it becomes a valid partition root. The effective set of exclusive * CPUs granted (effective_xcpus) depends on whether those exclusive * CPUs are passed down by its ancestors and not yet taken up by * another sibling partition root along the way. * * If its value isn't set, it defaults to cpus_allowed. */ cpumask_var_t exclusive_cpus; /* * This is old Memory Nodes tasks took on. * * - top_cpuset.old_mems_allowed is initialized to mems_allowed. * - A new cpuset's old_mems_allowed is initialized when some * task is moved into it. * - old_mems_allowed is used in cpuset_migrate_mm() when we change * cpuset.mems_allowed and have tasks' nodemask updated, and * then old_mems_allowed is updated to mems_allowed. */ nodemask_t old_mems_allowed; struct fmeter fmeter; /* memory_pressure filter */ /* * Tasks are being attached to this cpuset. Used to prevent * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). */ int attach_in_progress; /* for custom sched domain */ int relax_domain_level; /* number of valid local child partitions */ int nr_subparts; /* partition root state */ int partition_root_state; /* * number of SCHED_DEADLINE tasks attached to this cpuset, so that we * know when to rebuild associated root domain bandwidth information. */ int nr_deadline_tasks; int nr_migrate_dl_tasks; u64 sum_migrate_dl_bw; /* Invalid partition error code, not lock protected */ enum prs_errcode prs_err; /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; /* Remote partition silbling list anchored at remote_children */ struct list_head remote_sibling; /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; }; static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cpuset, css) : NULL; } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) { return css_cs(cs->css.parent); } /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) { return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); } static inline int is_mem_exclusive(const struct cpuset *cs) { return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); } static inline int is_mem_hardwall(const struct cpuset *cs) { return test_bit(CS_MEM_HARDWALL, &cs->flags); } static inline int is_sched_load_balance(const struct cpuset *cs) { return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } static inline int is_memory_migrate(const struct cpuset *cs) { return test_bit(CS_MEMORY_MIGRATE, &cs->flags); } static inline int is_spread_page(const struct cpuset *cs) { return test_bit(CS_SPREAD_PAGE, &cs->flags); } static inline int is_spread_slab(const struct cpuset *cs) { return test_bit(CS_SPREAD_SLAB, &cs->flags); } /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child * @pos_css: used for iteration * @parent_cs: target cpuset to walk children of * * Walk @child_cs through the online children of @parent_cs. Must be used * with RCU read locked. */ #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ css_for_each_child((pos_css), &(parent_cs)->css) \ if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) /** * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants * @des_cs: loop cursor pointing to the current descendant * @pos_css: used for iteration * @root_cs: target cpuset to walk ancestor of * * Walk @des_cs through the online descendants of @root_cs. Must be used * with RCU read locked. The caller may modify @pos_css by calling * css_rightmost_descendant() to skip subtree. @root_cs is included in the * iteration and the first node to be visited. */ #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) void rebuild_sched_domains_locked(void); void cpuset_callback_lock_irq(void); void cpuset_callback_unlock_irq(void); void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus); void cpuset_update_tasks_nodemask(struct cpuset *cs); int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs, int turning_on); ssize_t cpuset_write_resmask(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int cpuset_common_seq_show(struct seq_file *sf, void *v); /* * cpuset-v1.c */ #ifdef CONFIG_CPUSETS_V1 extern struct cftype cpuset1_files[]; void fmeter_init(struct fmeter *fmp); void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk); void cpuset1_update_tasks_flags(struct cpuset *cs); void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated); int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial); #else static inline void fmeter_init(struct fmeter *fmp) {} static inline void cpuset1_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) {} static inline void cpuset1_update_tasks_flags(struct cpuset *cs) {} static inline void cpuset1_hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) {} static inline int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial) { return 0; } #endif /* CONFIG_CPUSETS_V1 */ #endif /* __CPUSET_INTERNAL_H */ |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C)2006 USAGI/WIDE Project * * Author: * Masahide NAKAMURA @USAGI <masahide.nakamura.cz@hitachi.com> * * Based on net/netfilter/xt_tcpudp.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> #include <linux/ipv6.h> #include <net/ipv6.h> #include <net/mip6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6t_mh.h> MODULE_DESCRIPTION("Xtables: IPv6 Mobility Header match"); MODULE_LICENSE("GPL"); /* Returns 1 if the type is matched by the range, 0 otherwise */ static inline bool type_match(u_int8_t min, u_int8_t max, u_int8_t type, bool invert) { return (type >= min && type <= max) ^ invert; } static bool mh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ip6_mh _mh; const struct ip6_mh *mh; const struct ip6t_mh *mhinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; mh = skb_header_pointer(skb, par->thoff, sizeof(_mh), &_mh); if (mh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ pr_debug("Dropping evil MH tinygram.\n"); par->hotdrop = true; return false; } if (mh->ip6mh_proto != IPPROTO_NONE) { pr_debug("Dropping invalid MH Payload Proto: %u\n", mh->ip6mh_proto); par->hotdrop = true; return false; } return type_match(mhinfo->types[0], mhinfo->types[1], mh->ip6mh_type, !!(mhinfo->invflags & IP6T_MH_INV_TYPE)); } static int mh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_mh *mhinfo = par->matchinfo; /* Must specify no unknown invflags */ return (mhinfo->invflags & ~IP6T_MH_INV_MASK) ? -EINVAL : 0; } static struct xt_match mh_mt6_reg __read_mostly = { .name = "mh", .family = NFPROTO_IPV6, .checkentry = mh_mt6_check, .match = mh_mt6, .matchsize = sizeof(struct ip6t_mh), .proto = IPPROTO_MH, .me = THIS_MODULE, }; static int __init mh_mt6_init(void) { return xt_register_match(&mh_mt6_reg); } static void __exit mh_mt6_exit(void) { xt_unregister_match(&mh_mt6_reg); } module_init(mh_mt6_init); module_exit(mh_mt6_exit); |
4 4 3 2 1 1 3 1 1 1 2 1 12 12 1 2 6 6 9 9 9 9 9 9 9 16 16 15 16 10 16 3 2 1 1 14 1 1 5 1 3 4 1 3 1 14 1 8 7 5 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/arp.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/init.h> #include <linux/spinlock.h> #include <net/netrom.h> #include <linux/seq_file.h> #include <linux/export.h> static unsigned int nr_neigh_no = 1; static HLIST_HEAD(nr_node_list); static DEFINE_SPINLOCK(nr_node_list_lock); static HLIST_HEAD(nr_neigh_list); static DEFINE_SPINLOCK(nr_neigh_list_lock); static struct nr_node *nr_node_get(ax25_address *callsign) { struct nr_node *found = NULL; struct nr_node *nr_node; spin_lock_bh(&nr_node_list_lock); nr_node_for_each(nr_node, &nr_node_list) if (ax25cmp(callsign, &nr_node->callsign) == 0) { nr_node_hold(nr_node); found = nr_node; break; } spin_unlock_bh(&nr_node_list_lock); return found; } static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign, struct net_device *dev) { struct nr_neigh *found = NULL; struct nr_neigh *nr_neigh; spin_lock_bh(&nr_neigh_list_lock); nr_neigh_for_each(nr_neigh, &nr_neigh_list) if (ax25cmp(callsign, &nr_neigh->callsign) == 0 && nr_neigh->dev == dev) { nr_neigh_hold(nr_neigh); found = nr_neigh; break; } spin_unlock_bh(&nr_neigh_list_lock); return found; } static void nr_remove_neigh(struct nr_neigh *); /* re-sort the routes in quality order. */ static void re_sort_routes(struct nr_node *nr_node, int x, int y) { if (nr_node->routes[y].quality > nr_node->routes[x].quality) { if (nr_node->which == x) nr_node->which = y; else if (nr_node->which == y) nr_node->which = x; swap(nr_node->routes[x], nr_node->routes[y]); } } /* * Add a new route to a node, and in the process add the node and the * neighbour if it is new. */ static int __must_check nr_add_node(ax25_address *nr, const char *mnemonic, ax25_address *ax25, ax25_digi *ax25_digi, struct net_device *dev, int quality, int obs_count) { struct nr_node *nr_node; struct nr_neigh *nr_neigh; int i, found; struct net_device *odev; if ((odev=nr_dev_get(nr)) != NULL) { /* Can't add routes to ourself */ dev_put(odev); return -EINVAL; } nr_node = nr_node_get(nr); nr_neigh = nr_neigh_get_dev(ax25, dev); /* * The L2 link to a neighbour has failed in the past * and now a frame comes from this neighbour. We assume * it was a temporary trouble with the link and reset the * routes now (and not wait for a node broadcast). */ if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) { struct nr_node *nr_nodet; spin_lock_bh(&nr_node_list_lock); nr_node_for_each(nr_nodet, &nr_node_list) { nr_node_lock(nr_nodet); for (i = 0; i < nr_nodet->count; i++) if (nr_nodet->routes[i].neighbour == nr_neigh) if (i < nr_nodet->which) nr_nodet->which = i; nr_node_unlock(nr_nodet); } spin_unlock_bh(&nr_node_list_lock); } if (nr_neigh != NULL) nr_neigh->failed = 0; if (quality == 0 && nr_neigh != NULL && nr_node != NULL) { nr_neigh_put(nr_neigh); nr_node_put(nr_node); return 0; } if (nr_neigh == NULL) { if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) { if (nr_node) nr_node_put(nr_node); return -ENOMEM; } nr_neigh->callsign = *ax25; nr_neigh->digipeat = NULL; nr_neigh->ax25 = NULL; nr_neigh->dev = dev; nr_neigh->quality = READ_ONCE(sysctl_netrom_default_path_quality); nr_neigh->locked = 0; nr_neigh->count = 0; nr_neigh->number = nr_neigh_no++; nr_neigh->failed = 0; refcount_set(&nr_neigh->refcount, 1); if (ax25_digi != NULL && ax25_digi->ndigi > 0) { nr_neigh->digipeat = kmemdup(ax25_digi, sizeof(*ax25_digi), GFP_KERNEL); if (nr_neigh->digipeat == NULL) { kfree(nr_neigh); if (nr_node) nr_node_put(nr_node); return -ENOMEM; } } spin_lock_bh(&nr_neigh_list_lock); hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); nr_neigh_hold(nr_neigh); spin_unlock_bh(&nr_neigh_list_lock); } if (quality != 0 && ax25cmp(nr, ax25) == 0 && !nr_neigh->locked) nr_neigh->quality = quality; if (nr_node == NULL) { if ((nr_node = kmalloc(sizeof(*nr_node), GFP_ATOMIC)) == NULL) { if (nr_neigh) nr_neigh_put(nr_neigh); return -ENOMEM; } nr_node->callsign = *nr; strscpy(nr_node->mnemonic, mnemonic); nr_node->which = 0; nr_node->count = 1; refcount_set(&nr_node->refcount, 1); spin_lock_init(&nr_node->node_lock); nr_node->routes[0].quality = quality; nr_node->routes[0].obs_count = obs_count; nr_node->routes[0].neighbour = nr_neigh; nr_neigh_hold(nr_neigh); nr_neigh->count++; spin_lock_bh(&nr_node_list_lock); hlist_add_head(&nr_node->node_node, &nr_node_list); /* refcount initialized at 1 */ spin_unlock_bh(&nr_node_list_lock); nr_neigh_put(nr_neigh); return 0; } nr_node_lock(nr_node); if (quality != 0) strscpy(nr_node->mnemonic, mnemonic); for (found = 0, i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { nr_node->routes[i].quality = quality; nr_node->routes[i].obs_count = obs_count; found = 1; break; } } if (!found) { /* We have space at the bottom, slot it in */ if (nr_node->count < 3) { nr_node->routes[2] = nr_node->routes[1]; nr_node->routes[1] = nr_node->routes[0]; nr_node->routes[0].quality = quality; nr_node->routes[0].obs_count = obs_count; nr_node->routes[0].neighbour = nr_neigh; nr_node->which++; nr_node->count++; nr_neigh_hold(nr_neigh); nr_neigh->count++; } else { /* It must be better than the worst */ if (quality > nr_node->routes[2].quality) { nr_node->routes[2].neighbour->count--; nr_neigh_put(nr_node->routes[2].neighbour); if (nr_node->routes[2].neighbour->count == 0 && !nr_node->routes[2].neighbour->locked) nr_remove_neigh(nr_node->routes[2].neighbour); nr_node->routes[2].quality = quality; nr_node->routes[2].obs_count = obs_count; nr_node->routes[2].neighbour = nr_neigh; nr_neigh_hold(nr_neigh); nr_neigh->count++; } } } /* Now re-sort the routes in quality order */ switch (nr_node->count) { case 3: re_sort_routes(nr_node, 0, 1); re_sort_routes(nr_node, 1, 2); fallthrough; case 2: re_sort_routes(nr_node, 0, 1); break; case 1: break; } for (i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { if (i < nr_node->which) nr_node->which = i; break; } } nr_neigh_put(nr_neigh); nr_node_unlock(nr_node); nr_node_put(nr_node); return 0; } static void nr_remove_node_locked(struct nr_node *nr_node) { lockdep_assert_held(&nr_node_list_lock); hlist_del_init(&nr_node->node_node); nr_node_put(nr_node); } static inline void __nr_remove_neigh(struct nr_neigh *nr_neigh) { hlist_del_init(&nr_neigh->neigh_node); nr_neigh_put(nr_neigh); } #define nr_remove_neigh_locked(__neigh) \ __nr_remove_neigh(__neigh) static void nr_remove_neigh(struct nr_neigh *nr_neigh) { spin_lock_bh(&nr_neigh_list_lock); __nr_remove_neigh(nr_neigh); spin_unlock_bh(&nr_neigh_list_lock); } /* * "Delete" a node. Strictly speaking remove a route to a node. The node * is only deleted if no routes are left to it. */ static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct net_device *dev) { struct nr_node *nr_node; struct nr_neigh *nr_neigh; int i; nr_node = nr_node_get(callsign); if (nr_node == NULL) return -EINVAL; nr_neigh = nr_neigh_get_dev(neighbour, dev); if (nr_neigh == NULL) { nr_node_put(nr_node); return -EINVAL; } spin_lock_bh(&nr_node_list_lock); nr_node_lock(nr_node); for (i = 0; i < nr_node->count; i++) { if (nr_node->routes[i].neighbour == nr_neigh) { nr_neigh->count--; nr_neigh_put(nr_neigh); if (nr_neigh->count == 0 && !nr_neigh->locked) nr_remove_neigh(nr_neigh); nr_neigh_put(nr_neigh); nr_node->count--; if (nr_node->count == 0) { nr_remove_node_locked(nr_node); } else { switch (i) { case 0: nr_node->routes[0] = nr_node->routes[1]; fallthrough; case 1: nr_node->routes[1] = nr_node->routes[2]; fallthrough; case 2: break; } nr_node_put(nr_node); } nr_node_unlock(nr_node); spin_unlock_bh(&nr_node_list_lock); return 0; } } nr_neigh_put(nr_neigh); nr_node_unlock(nr_node); spin_unlock_bh(&nr_node_list_lock); nr_node_put(nr_node); return -EINVAL; } /* * Lock a neighbour with a quality. */ static int __must_check nr_add_neigh(ax25_address *callsign, ax25_digi *ax25_digi, struct net_device *dev, unsigned int quality) { struct nr_neigh *nr_neigh; nr_neigh = nr_neigh_get_dev(callsign, dev); if (nr_neigh) { nr_neigh->quality = quality; nr_neigh->locked = 1; nr_neigh_put(nr_neigh); return 0; } if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) return -ENOMEM; nr_neigh->callsign = *callsign; nr_neigh->digipeat = NULL; nr_neigh->ax25 = NULL; nr_neigh->dev = dev; nr_neigh->quality = quality; nr_neigh->locked = 1; nr_neigh->count = 0; nr_neigh->number = nr_neigh_no++; nr_neigh->failed = 0; refcount_set(&nr_neigh->refcount, 1); if (ax25_digi != NULL && ax25_digi->ndigi > 0) { nr_neigh->digipeat = kmemdup(ax25_digi, sizeof(*ax25_digi), GFP_KERNEL); if (nr_neigh->digipeat == NULL) { kfree(nr_neigh); return -ENOMEM; } } spin_lock_bh(&nr_neigh_list_lock); hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); /* refcount is initialized at 1 */ spin_unlock_bh(&nr_neigh_list_lock); return 0; } /* * "Delete" a neighbour. The neighbour is only removed if the number * of nodes that may use it is zero. */ static int nr_del_neigh(ax25_address *callsign, struct net_device *dev, unsigned int quality) { struct nr_neigh *nr_neigh; nr_neigh = nr_neigh_get_dev(callsign, dev); if (nr_neigh == NULL) return -EINVAL; nr_neigh->quality = quality; nr_neigh->locked = 0; if (nr_neigh->count == 0) nr_remove_neigh(nr_neigh); nr_neigh_put(nr_neigh); return 0; } /* * Decrement the obsolescence count by one. If a route is reduced to a * count of zero, remove it. Also remove any unlocked neighbours with * zero nodes routing via it. */ static int nr_dec_obs(void) { struct nr_neigh *nr_neigh; struct nr_node *s; struct hlist_node *nodet; int i; spin_lock_bh(&nr_node_list_lock); nr_node_for_each_safe(s, nodet, &nr_node_list) { nr_node_lock(s); for (i = 0; i < s->count; i++) { switch (s->routes[i].obs_count) { case 0: /* A locked entry */ break; case 1: /* From 1 -> 0 */ nr_neigh = s->routes[i].neighbour; nr_neigh->count--; nr_neigh_put(nr_neigh); if (nr_neigh->count == 0 && !nr_neigh->locked) nr_remove_neigh(nr_neigh); s->count--; switch (i) { case 0: s->routes[0] = s->routes[1]; fallthrough; case 1: s->routes[1] = s->routes[2]; break; case 2: break; } break; default: s->routes[i].obs_count--; break; } } if (s->count <= 0) nr_remove_node_locked(s); nr_node_unlock(s); } spin_unlock_bh(&nr_node_list_lock); return 0; } /* * A device has been removed. Remove its routes and neighbours. */ void nr_rt_device_down(struct net_device *dev) { struct nr_neigh *s; struct hlist_node *nodet, *node2t; struct nr_node *t; int i; spin_lock_bh(&nr_neigh_list_lock); nr_neigh_for_each_safe(s, nodet, &nr_neigh_list) { if (s->dev == dev) { spin_lock_bh(&nr_node_list_lock); nr_node_for_each_safe(t, node2t, &nr_node_list) { nr_node_lock(t); for (i = 0; i < t->count; i++) { if (t->routes[i].neighbour == s) { t->count--; switch (i) { case 0: t->routes[0] = t->routes[1]; fallthrough; case 1: t->routes[1] = t->routes[2]; break; case 2: break; } } } if (t->count <= 0) nr_remove_node_locked(t); nr_node_unlock(t); } spin_unlock_bh(&nr_node_list_lock); nr_remove_neigh_locked(s); } } spin_unlock_bh(&nr_neigh_list_lock); } /* * Check that the device given is a valid AX.25 interface that is "up". * Or a valid ethernet interface with an AX.25 callsign binding. */ static struct net_device *nr_ax25_dev_get(char *devname) { struct net_device *dev; if ((dev = dev_get_by_name(&init_net, devname)) == NULL) return NULL; if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) return dev; dev_put(dev); return NULL; } /* * Find the first active NET/ROM device, usually "nr0". */ struct net_device *nr_dev_first(void) { struct net_device *dev, *first = NULL; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } dev_hold(first); rcu_read_unlock(); return first; } /* * Find the NET/ROM device for the given callsign. */ struct net_device *nr_dev_get(ax25_address *addr) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && ax25cmp(addr, (const ax25_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } } dev = NULL; out: rcu_read_unlock(); return dev; } static ax25_digi *nr_call_to_digi(ax25_digi *digi, int ndigis, ax25_address *digipeaters) { int i; if (ndigis == 0) return NULL; for (i = 0; i < ndigis; i++) { digi->calls[i] = digipeaters[i]; digi->repeated[i] = 0; } digi->ndigi = ndigis; digi->lastrepeat = -1; return digi; } /* * Handle the ioctls that control the routing functions. */ int nr_rt_ioctl(unsigned int cmd, void __user *arg) { struct nr_route_struct nr_route; struct net_device *dev; ax25_digi digi; int ret; switch (cmd) { case SIOCADDRT: if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) return -EFAULT; if (nr_route.ndigis > AX25_MAX_DIGIS) return -EINVAL; if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) return -EINVAL; switch (nr_route.type) { case NETROM_NODE: if (strnlen(nr_route.mnemonic, 7) == 7) { ret = -EINVAL; break; } ret = nr_add_node(&nr_route.callsign, nr_route.mnemonic, &nr_route.neighbour, nr_call_to_digi(&digi, nr_route.ndigis, nr_route.digipeaters), dev, nr_route.quality, nr_route.obs_count); break; case NETROM_NEIGH: ret = nr_add_neigh(&nr_route.callsign, nr_call_to_digi(&digi, nr_route.ndigis, nr_route.digipeaters), dev, nr_route.quality); break; default: ret = -EINVAL; } dev_put(dev); return ret; case SIOCDELRT: if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) return -EFAULT; if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) return -EINVAL; switch (nr_route.type) { case NETROM_NODE: ret = nr_del_node(&nr_route.callsign, &nr_route.neighbour, dev); break; case NETROM_NEIGH: ret = nr_del_neigh(&nr_route.callsign, dev, nr_route.quality); break; default: ret = -EINVAL; } dev_put(dev); return ret; case SIOCNRDECOBS: return nr_dec_obs(); default: return -EINVAL; } return 0; } /* * A level 2 link has timed out, therefore it appears to be a poor link, * then don't use that neighbour until it is reset. */ void nr_link_failed(ax25_cb *ax25, int reason) { struct nr_neigh *s, *nr_neigh = NULL; struct nr_node *nr_node = NULL; spin_lock_bh(&nr_neigh_list_lock); nr_neigh_for_each(s, &nr_neigh_list) { if (s->ax25 == ax25) { nr_neigh_hold(s); nr_neigh = s; break; } } spin_unlock_bh(&nr_neigh_list_lock); if (nr_neigh == NULL) return; nr_neigh->ax25 = NULL; ax25_cb_put(ax25); if (++nr_neigh->failed < READ_ONCE(sysctl_netrom_link_fails_count)) { nr_neigh_put(nr_neigh); return; } spin_lock_bh(&nr_node_list_lock); nr_node_for_each(nr_node, &nr_node_list) { nr_node_lock(nr_node); if (nr_node->which < nr_node->count && nr_node->routes[nr_node->which].neighbour == nr_neigh) nr_node->which++; nr_node_unlock(nr_node); } spin_unlock_bh(&nr_node_list_lock); nr_neigh_put(nr_neigh); } /* * Route a frame to an appropriate AX.25 connection. A NULL ax25_cb * indicates an internally generated frame. */ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) { ax25_address *nr_src, *nr_dest; struct nr_neigh *nr_neigh; struct nr_node *nr_node; struct net_device *dev; unsigned char *dptr; ax25_cb *ax25s; int ret; struct sk_buff *skbn; /* * Reject malformed packets early. Check that it contains at least 2 * addresses and 1 byte more for Time-To-Live */ if (skb->len < 2 * sizeof(ax25_address) + 1) return 0; nr_src = (ax25_address *)(skb->data + 0); nr_dest = (ax25_address *)(skb->data + 7); if (ax25 != NULL) { ret = nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat, ax25->ax25_dev->dev, 0, READ_ONCE(sysctl_netrom_obsolescence_count_initialiser)); if (ret) return ret; } if ((dev = nr_dev_get(nr_dest)) != NULL) { /* Its for me */ if (ax25 == NULL) /* Its from me */ ret = nr_loopback_queue(skb); else ret = nr_rx_frame(skb, dev); dev_put(dev); return ret; } if (!READ_ONCE(sysctl_netrom_routing_control) && ax25 != NULL) return 0; /* Its Time-To-Live has expired */ if (skb->data[14] == 1) { return 0; } nr_node = nr_node_get(nr_dest); if (nr_node == NULL) return 0; nr_node_lock(nr_node); if (nr_node->which >= nr_node->count) { nr_node_unlock(nr_node); nr_node_put(nr_node); return 0; } nr_neigh = nr_node->routes[nr_node->which].neighbour; if ((dev = nr_dev_first()) == NULL) { nr_node_unlock(nr_node); nr_node_put(nr_node); return 0; } /* We are going to change the netrom headers so we should get our own skb, we also did not know until now how much header space we had to reserve... - RXQ */ if ((skbn=skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC)) == NULL) { nr_node_unlock(nr_node); nr_node_put(nr_node); dev_put(dev); return 0; } kfree_skb(skb); skb=skbn; skb->data[14]--; dptr = skb_push(skb, 1); *dptr = AX25_P_NETROM; ax25s = nr_neigh->ax25; nr_neigh->ax25 = ax25_send_frame(skb, 256, (const ax25_address *)dev->dev_addr, &nr_neigh->callsign, nr_neigh->digipeat, nr_neigh->dev); if (ax25s) ax25_cb_put(ax25s); dev_put(dev); ret = (nr_neigh->ax25 != NULL); nr_node_unlock(nr_node); nr_node_put(nr_node); return ret; } #ifdef CONFIG_PROC_FS static void *nr_node_start(struct seq_file *seq, loff_t *pos) __acquires(&nr_node_list_lock) { spin_lock_bh(&nr_node_list_lock); return seq_hlist_start_head(&nr_node_list, *pos); } static void *nr_node_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &nr_node_list, pos); } static void nr_node_stop(struct seq_file *seq, void *v) __releases(&nr_node_list_lock) { spin_unlock_bh(&nr_node_list_lock); } static int nr_node_show(struct seq_file *seq, void *v) { char buf[11]; int i; if (v == SEQ_START_TOKEN) seq_puts(seq, "callsign mnemonic w n qual obs neigh qual obs neigh qual obs neigh\n"); else { struct nr_node *nr_node = hlist_entry(v, struct nr_node, node_node); nr_node_lock(nr_node); seq_printf(seq, "%-9s %-7s %d %d", ax2asc(buf, &nr_node->callsign), (nr_node->mnemonic[0] == '\0') ? "*" : nr_node->mnemonic, nr_node->which + 1, nr_node->count); for (i = 0; i < nr_node->count; i++) { seq_printf(seq, " %3d %d %05d", nr_node->routes[i].quality, nr_node->routes[i].obs_count, nr_node->routes[i].neighbour->number); } nr_node_unlock(nr_node); seq_puts(seq, "\n"); } return 0; } const struct seq_operations nr_node_seqops = { .start = nr_node_start, .next = nr_node_next, .stop = nr_node_stop, .show = nr_node_show, }; static void *nr_neigh_start(struct seq_file *seq, loff_t *pos) __acquires(&nr_neigh_list_lock) { spin_lock_bh(&nr_neigh_list_lock); return seq_hlist_start_head(&nr_neigh_list, *pos); } static void *nr_neigh_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &nr_neigh_list, pos); } static void nr_neigh_stop(struct seq_file *seq, void *v) __releases(&nr_neigh_list_lock) { spin_unlock_bh(&nr_neigh_list_lock); } static int nr_neigh_show(struct seq_file *seq, void *v) { char buf[11]; int i; if (v == SEQ_START_TOKEN) seq_puts(seq, "addr callsign dev qual lock count failed digipeaters\n"); else { struct nr_neigh *nr_neigh; nr_neigh = hlist_entry(v, struct nr_neigh, neigh_node); seq_printf(seq, "%05d %-9s %-4s %3d %d %3d %3d", nr_neigh->number, ax2asc(buf, &nr_neigh->callsign), nr_neigh->dev ? nr_neigh->dev->name : "???", nr_neigh->quality, nr_neigh->locked, nr_neigh->count, nr_neigh->failed); if (nr_neigh->digipeat != NULL) { for (i = 0; i < nr_neigh->digipeat->ndigi; i++) seq_printf(seq, " %s", ax2asc(buf, &nr_neigh->digipeat->calls[i])); } seq_puts(seq, "\n"); } return 0; } const struct seq_operations nr_neigh_seqops = { .start = nr_neigh_start, .next = nr_neigh_next, .stop = nr_neigh_stop, .show = nr_neigh_show, }; #endif /* * Free all memory associated with the nodes and routes lists. */ void nr_rt_free(void) { struct nr_neigh *s = NULL; struct nr_node *t = NULL; struct hlist_node *nodet; spin_lock_bh(&nr_neigh_list_lock); spin_lock_bh(&nr_node_list_lock); nr_node_for_each_safe(t, nodet, &nr_node_list) { nr_node_lock(t); nr_remove_node_locked(t); nr_node_unlock(t); } nr_neigh_for_each_safe(s, nodet, &nr_neigh_list) { while(s->count) { s->count--; nr_neigh_put(s); } nr_remove_neigh_locked(s); } spin_unlock_bh(&nr_node_list_lock); spin_unlock_bh(&nr_neigh_list_lock); } |
27 27 102 30 2 30 30 30 29 5 5 5 5 5 1 1 5 5 5 5 5 28 27 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp tsn mapping array. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Jon Grimm <jgrimm@us.ibm.com> * Karl Knutson <karl@athena.chicago.il.us> * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/bitmap.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static void sctp_tsnmap_update(struct sctp_tsnmap *map); static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, __u16 len, __u16 *start, __u16 *end); static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size); /* Initialize a block of memory as a tsnmap. */ struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len, __u32 initial_tsn, gfp_t gfp) { if (!map->tsn_map) { map->tsn_map = kzalloc(len>>3, gfp); if (map->tsn_map == NULL) return NULL; map->len = len; } else { bitmap_zero(map->tsn_map, map->len); } /* Keep track of TSNs represented by tsn_map. */ map->base_tsn = initial_tsn; map->cumulative_tsn_ack_point = initial_tsn - 1; map->max_tsn_seen = map->cumulative_tsn_ack_point; map->num_dup_tsns = 0; return map; } void sctp_tsnmap_free(struct sctp_tsnmap *map) { map->len = 0; kfree(map->tsn_map); } /* Test the tracking state of this TSN. * Returns: * 0 if the TSN has not yet been seen * >0 if the TSN has been seen (duplicate) * <0 if the TSN is invalid (too large to track) */ int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn) { u32 gap; /* Check to see if this is an old TSN */ if (TSN_lte(tsn, map->cumulative_tsn_ack_point)) return 1; /* Verify that we can hold this TSN and that it will not * overflow our map */ if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) return -1; /* Calculate the index into the mapping arrays. */ gap = tsn - map->base_tsn; /* Check to see if TSN has already been recorded. */ if (gap < map->len && test_bit(gap, map->tsn_map)) return 1; else return 0; } /* Mark this TSN as seen. */ int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn, struct sctp_transport *trans) { u16 gap; if (TSN_lt(tsn, map->base_tsn)) return 0; gap = tsn - map->base_tsn; if (gap >= map->len && !sctp_tsnmap_grow(map, gap + 1)) return -ENOMEM; if (!sctp_tsnmap_has_gap(map) && gap == 0) { /* In this case the map has no gaps and the tsn we are * recording is the next expected tsn. We don't touch * the map but simply bump the values. */ map->max_tsn_seen++; map->cumulative_tsn_ack_point++; if (trans) trans->sack_generation = trans->asoc->peer.sack_generation; map->base_tsn++; } else { /* Either we already have a gap, or about to record a gap, so * have work to do. * * Bump the max. */ if (TSN_lt(map->max_tsn_seen, tsn)) map->max_tsn_seen = tsn; /* Mark the TSN as received. */ set_bit(gap, map->tsn_map); /* Go fixup any internal TSN mapping variables including * cumulative_tsn_ack_point. */ sctp_tsnmap_update(map); } return 0; } /* Initialize a Gap Ack Block iterator from memory being provided. */ static void sctp_tsnmap_iter_init(const struct sctp_tsnmap *map, struct sctp_tsnmap_iter *iter) { /* Only start looking one past the Cumulative TSN Ack Point. */ iter->start = map->cumulative_tsn_ack_point + 1; } /* Get the next Gap Ack Blocks. Returns 0 if there was not another block * to get. */ static int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map, struct sctp_tsnmap_iter *iter, __u16 *start, __u16 *end) { int ended = 0; __u16 start_ = 0, end_ = 0, offset; /* If there are no more gap acks possible, get out fast. */ if (TSN_lte(map->max_tsn_seen, iter->start)) return 0; offset = iter->start - map->base_tsn; sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len, &start_, &end_); /* The Gap Ack Block happens to end at the end of the map. */ if (start_ && !end_) end_ = map->len - 1; /* If we found a Gap Ack Block, return the start and end and * bump the iterator forward. */ if (end_) { /* Fix up the start and end based on the * Cumulative TSN Ack which is always 1 behind base. */ *start = start_ + 1; *end = end_ + 1; /* Move the iterator forward. */ iter->start = map->cumulative_tsn_ack_point + *end + 1; ended = 1; } return ended; } /* Mark this and any lower TSN as seen. */ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn) { u32 gap; if (TSN_lt(tsn, map->base_tsn)) return; if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) return; /* Bump the max. */ if (TSN_lt(map->max_tsn_seen, tsn)) map->max_tsn_seen = tsn; gap = tsn - map->base_tsn + 1; map->base_tsn += gap; map->cumulative_tsn_ack_point += gap; if (gap >= map->len) { /* If our gap is larger then the map size, just * zero out the map. */ bitmap_zero(map->tsn_map, map->len); } else { /* If the gap is smaller than the map size, * shift the map by 'gap' bits and update further. */ bitmap_shift_right(map->tsn_map, map->tsn_map, gap, map->len); sctp_tsnmap_update(map); } } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This private helper function updates the tsnmap buffers and * the Cumulative TSN Ack Point. */ static void sctp_tsnmap_update(struct sctp_tsnmap *map) { u16 len; unsigned long zero_bit; len = map->max_tsn_seen - map->cumulative_tsn_ack_point; zero_bit = find_first_zero_bit(map->tsn_map, len); if (!zero_bit) return; /* The first 0-bit is bit 0. nothing to do */ map->base_tsn += zero_bit; map->cumulative_tsn_ack_point += zero_bit; bitmap_shift_right(map->tsn_map, map->tsn_map, zero_bit, map->len); } /* How many data chunks are we missing from our peer? */ __u16 sctp_tsnmap_pending(struct sctp_tsnmap *map) { __u32 cum_tsn = map->cumulative_tsn_ack_point; __u32 max_tsn = map->max_tsn_seen; __u32 base_tsn = map->base_tsn; __u16 pending_data; u32 gap; pending_data = max_tsn - cum_tsn; gap = max_tsn - base_tsn; if (gap == 0 || gap >= map->len) goto out; pending_data -= bitmap_weight(map->tsn_map, gap + 1); out: return pending_data; } /* This is a private helper for finding Gap Ack Blocks. It searches a * single array for the start and end of a Gap Ack Block. * * The flags "started" and "ended" tell is if we found the beginning * or (respectively) the end of a Gap Ack Block. */ static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, __u16 len, __u16 *start, __u16 *end) { int i = off; /* Look through the entire array, but break out * early if we have found the end of the Gap Ack Block. */ /* Also, stop looking past the maximum TSN seen. */ /* Look for the start. */ i = find_next_bit(map, len, off); if (i < len) *start = i; /* Look for the end. */ if (*start) { /* We have found the start, let's find the * end. If we find the end, break out. */ i = find_next_zero_bit(map, len, i); if (i < len) *end = i - 1; } } /* Renege that we have seen a TSN. */ void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn) { u32 gap; if (TSN_lt(tsn, map->base_tsn)) return; /* Assert: TSN is in range. */ if (!TSN_lt(tsn, map->base_tsn + map->len)) return; gap = tsn - map->base_tsn; /* Pretend we never saw the TSN. */ clear_bit(gap, map->tsn_map); } /* How many gap ack blocks do we have recorded? */ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map, struct sctp_gap_ack_block *gabs) { struct sctp_tsnmap_iter iter; int ngaps = 0; /* Refresh the gap ack information. */ if (sctp_tsnmap_has_gap(map)) { __u16 start = 0, end = 0; sctp_tsnmap_iter_init(map, &iter); while (sctp_tsnmap_next_gap_ack(map, &iter, &start, &end)) { gabs[ngaps].start = htons(start); gabs[ngaps].end = htons(end); ngaps++; if (ngaps >= SCTP_MAX_GABS) break; } } return ngaps; } static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size) { unsigned long *new; unsigned long inc; u16 len; if (size > SCTP_TSN_MAP_SIZE) return 0; inc = ALIGN((size - map->len), BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT; len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE); new = kzalloc(len>>3, GFP_ATOMIC); if (!new) return 0; bitmap_copy(new, map->tsn_map, map->max_tsn_seen - map->cumulative_tsn_ack_point); kfree(map->tsn_map); map->tsn_map = new; map->len = len; return 1; } |
2 2 1 1 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2018 Samsung Electronics Co., Ltd. */ #include <linux/jhash.h> #include <linux/slab.h> #include <linux/rwsem.h> #include <linux/mutex.h> #include <linux/wait.h> #include <linux/hashtable.h> #include <net/net_namespace.h> #include <net/genetlink.h> #include <linux/socket.h> #include <linux/workqueue.h> #include "vfs_cache.h" #include "transport_ipc.h" #include "server.h" #include "smb_common.h" #include "mgmt/user_config.h" #include "mgmt/share_config.h" #include "mgmt/user_session.h" #include "mgmt/tree_connect.h" #include "mgmt/ksmbd_ida.h" #include "connection.h" #include "transport_tcp.h" #include "transport_rdma.h" #define IPC_WAIT_TIMEOUT (2 * HZ) #define IPC_MSG_HASH_BITS 3 static DEFINE_HASHTABLE(ipc_msg_table, IPC_MSG_HASH_BITS); static DECLARE_RWSEM(ipc_msg_table_lock); static DEFINE_MUTEX(startup_lock); static DEFINE_IDA(ipc_ida); static unsigned int ksmbd_tools_pid; static bool ksmbd_ipc_validate_version(struct genl_info *m) { if (m->genlhdr->version != KSMBD_GENL_VERSION) { pr_err("%s. ksmbd: %d, kernel module: %d. %s.\n", "Daemon and kernel module version mismatch", m->genlhdr->version, KSMBD_GENL_VERSION, "User-space ksmbd should terminate"); return false; } return true; } struct ksmbd_ipc_msg { unsigned int type; unsigned int sz; unsigned char payload[]; }; struct ipc_msg_table_entry { unsigned int handle; unsigned int type; wait_queue_head_t wait; struct hlist_node ipc_table_hlist; void *response; unsigned int msg_sz; }; static struct delayed_work ipc_timer_work; static int handle_startup_event(struct sk_buff *skb, struct genl_info *info); static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info); static int handle_generic_event(struct sk_buff *skb, struct genl_info *info); static int ksmbd_ipc_heartbeat_request(void); static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = { [KSMBD_EVENT_UNSPEC] = { .len = 0, }, [KSMBD_EVENT_HEARTBEAT_REQUEST] = { .len = sizeof(struct ksmbd_heartbeat), }, [KSMBD_EVENT_STARTING_UP] = { .len = sizeof(struct ksmbd_startup_request), }, [KSMBD_EVENT_SHUTTING_DOWN] = { .len = sizeof(struct ksmbd_shutdown_request), }, [KSMBD_EVENT_LOGIN_REQUEST] = { .len = sizeof(struct ksmbd_login_request), }, [KSMBD_EVENT_LOGIN_RESPONSE] = { .len = sizeof(struct ksmbd_login_response), }, [KSMBD_EVENT_SHARE_CONFIG_REQUEST] = { .len = sizeof(struct ksmbd_share_config_request), }, [KSMBD_EVENT_SHARE_CONFIG_RESPONSE] = { .len = sizeof(struct ksmbd_share_config_response), }, [KSMBD_EVENT_TREE_CONNECT_REQUEST] = { .len = sizeof(struct ksmbd_tree_connect_request), }, [KSMBD_EVENT_TREE_CONNECT_RESPONSE] = { .len = sizeof(struct ksmbd_tree_connect_response), }, [KSMBD_EVENT_TREE_DISCONNECT_REQUEST] = { .len = sizeof(struct ksmbd_tree_disconnect_request), }, [KSMBD_EVENT_LOGOUT_REQUEST] = { .len = sizeof(struct ksmbd_logout_request), }, [KSMBD_EVENT_RPC_REQUEST] = { }, [KSMBD_EVENT_RPC_RESPONSE] = { }, [KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST] = { }, [KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE] = { }, [KSMBD_EVENT_LOGIN_REQUEST_EXT] = { .len = sizeof(struct ksmbd_login_request), }, [KSMBD_EVENT_LOGIN_RESPONSE_EXT] = { .len = sizeof(struct ksmbd_login_response_ext), }, }; static struct genl_ops ksmbd_genl_ops[] = { { .cmd = KSMBD_EVENT_UNSPEC, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_HEARTBEAT_REQUEST, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_STARTING_UP, .doit = handle_startup_event, }, { .cmd = KSMBD_EVENT_SHUTTING_DOWN, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_LOGIN_REQUEST, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_LOGIN_RESPONSE, .doit = handle_generic_event, }, { .cmd = KSMBD_EVENT_SHARE_CONFIG_REQUEST, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_SHARE_CONFIG_RESPONSE, .doit = handle_generic_event, }, { .cmd = KSMBD_EVENT_TREE_CONNECT_REQUEST, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_TREE_CONNECT_RESPONSE, .doit = handle_generic_event, }, { .cmd = KSMBD_EVENT_TREE_DISCONNECT_REQUEST, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_LOGOUT_REQUEST, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_RPC_REQUEST, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_RPC_RESPONSE, .doit = handle_generic_event, }, { .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE, .doit = handle_generic_event, }, { .cmd = KSMBD_EVENT_LOGIN_REQUEST_EXT, .doit = handle_unsupported_event, }, { .cmd = KSMBD_EVENT_LOGIN_RESPONSE_EXT, .doit = handle_generic_event, }, }; static struct genl_family ksmbd_genl_family = { .name = KSMBD_GENL_NAME, .version = KSMBD_GENL_VERSION, .hdrsize = 0, .maxattr = KSMBD_EVENT_MAX, .netnsok = true, .module = THIS_MODULE, .ops = ksmbd_genl_ops, .n_ops = ARRAY_SIZE(ksmbd_genl_ops), .resv_start_op = KSMBD_EVENT_LOGIN_RESPONSE_EXT + 1, }; static void ksmbd_nl_init_fixup(void) { int i; for (i = 0; i < ARRAY_SIZE(ksmbd_genl_ops); i++) ksmbd_genl_ops[i].validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP; ksmbd_genl_family.policy = ksmbd_nl_policy; } static int rpc_context_flags(struct ksmbd_session *sess) { if (user_guest(sess->user)) return KSMBD_RPC_RESTRICTED_CONTEXT; return 0; } static void ipc_update_last_active(void) { if (server_conf.ipc_timeout) server_conf.ipc_last_active = jiffies; } static struct ksmbd_ipc_msg *ipc_msg_alloc(size_t sz) { struct ksmbd_ipc_msg *msg; size_t msg_sz = sz + sizeof(struct ksmbd_ipc_msg); msg = kvzalloc(msg_sz, KSMBD_DEFAULT_GFP); if (msg) msg->sz = sz; return msg; } static void ipc_msg_free(struct ksmbd_ipc_msg *msg) { kvfree(msg); } static void ipc_msg_handle_free(int handle) { if (handle >= 0) ksmbd_release_id(&ipc_ida, handle); } static int handle_response(int type, void *payload, size_t sz) { unsigned int handle = *(unsigned int *)payload; struct ipc_msg_table_entry *entry; int ret = 0; ipc_update_last_active(); down_read(&ipc_msg_table_lock); hash_for_each_possible(ipc_msg_table, entry, ipc_table_hlist, handle) { if (handle != entry->handle) continue; entry->response = NULL; /* * Response message type value should be equal to * request message type + 1. */ if (entry->type + 1 != type) { pr_err("Waiting for IPC type %d, got %d. Ignore.\n", entry->type + 1, type); } entry->response = kvzalloc(sz, KSMBD_DEFAULT_GFP); if (!entry->response) { ret = -ENOMEM; break; } memcpy(entry->response, payload, sz); entry->msg_sz = sz; wake_up_interruptible(&entry->wait); ret = 0; break; } up_read(&ipc_msg_table_lock); return ret; } static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) { int ret; ksmbd_set_fd_limit(req->file_max); server_conf.flags = req->flags; server_conf.signing = req->signing; server_conf.tcp_port = req->tcp_port; server_conf.ipc_timeout = req->ipc_timeout * HZ; server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL; server_conf.share_fake_fscaps = req->share_fake_fscaps; ksmbd_init_domain(req->sub_auth); if (req->smb2_max_read) init_smb2_max_read_size(req->smb2_max_read); if (req->smb2_max_write) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); if (req->smb2_max_credits) { init_smb2_max_credits(req->smb2_max_credits); server_conf.max_inflight_req = req->smb2_max_credits; } if (req->smbd_max_io_size) init_smbd_max_io_size(req->smbd_max_io_size); if (req->max_connections) server_conf.max_connections = req->max_connections; ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); ret |= ksmbd_set_work_group(req->work_group); ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req), req->ifc_list_sz); if (ret) { pr_err("Server configuration error: %s %s %s\n", req->netbios_name, req->server_string, req->work_group); return ret; } if (req->min_prot[0]) { ret = ksmbd_lookup_protocol_idx(req->min_prot); if (ret >= 0) server_conf.min_protocol = ret; } if (req->max_prot[0]) { ret = ksmbd_lookup_protocol_idx(req->max_prot); if (ret >= 0) server_conf.max_protocol = ret; } if (server_conf.ipc_timeout) schedule_delayed_work(&ipc_timer_work, server_conf.ipc_timeout); return 0; } static int handle_startup_event(struct sk_buff *skb, struct genl_info *info) { int ret = 0; #ifdef CONFIG_SMB_SERVER_CHECK_CAP_NET_ADMIN if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; #endif if (!ksmbd_ipc_validate_version(info)) return -EINVAL; if (!info->attrs[KSMBD_EVENT_STARTING_UP]) return -EINVAL; mutex_lock(&startup_lock); if (!ksmbd_server_configurable()) { mutex_unlock(&startup_lock); pr_err("Server reset is in progress, can't start daemon\n"); return -EINVAL; } if (ksmbd_tools_pid) { if (ksmbd_ipc_heartbeat_request() == 0) { ret = -EINVAL; goto out; } pr_err("Reconnect to a new user space daemon\n"); } else { struct ksmbd_startup_request *req; req = nla_data(info->attrs[info->genlhdr->cmd]); ret = ipc_server_config_on_startup(req); if (ret) goto out; server_queue_ctrl_init_work(); } ksmbd_tools_pid = info->snd_portid; ipc_update_last_active(); out: mutex_unlock(&startup_lock); return ret; } static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info) { pr_err("Unknown IPC event: %d, ignore.\n", info->genlhdr->cmd); return -EINVAL; } static int handle_generic_event(struct sk_buff *skb, struct genl_info *info) { void *payload; int sz; int type = info->genlhdr->cmd; #ifdef CONFIG_SMB_SERVER_CHECK_CAP_NET_ADMIN if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; #endif if (type > KSMBD_EVENT_MAX) { WARN_ON(1); return -EINVAL; } if (!ksmbd_ipc_validate_version(info)) return -EINVAL; if (!info->attrs[type]) return -EINVAL; payload = nla_data(info->attrs[info->genlhdr->cmd]); sz = nla_len(info->attrs[info->genlhdr->cmd]); return handle_response(type, payload, sz); } static int ipc_msg_send(struct ksmbd_ipc_msg *msg) { struct genlmsghdr *nlh; struct sk_buff *skb; int ret = -EINVAL; if (!ksmbd_tools_pid) return ret; skb = genlmsg_new(msg->sz, KSMBD_DEFAULT_GFP); if (!skb) return -ENOMEM; nlh = genlmsg_put(skb, 0, 0, &ksmbd_genl_family, 0, msg->type); if (!nlh) goto out; ret = nla_put(skb, msg->type, msg->sz, msg->payload); if (ret) { genlmsg_cancel(skb, nlh); goto out; } genlmsg_end(skb, nlh); ret = genlmsg_unicast(&init_net, skb, ksmbd_tools_pid); if (!ret) ipc_update_last_active(); return ret; out: nlmsg_free(skb); return ret; } static int ipc_validate_msg(struct ipc_msg_table_entry *entry) { unsigned int msg_sz = entry->msg_sz; switch (entry->type) { case KSMBD_EVENT_RPC_REQUEST: { struct ksmbd_rpc_command *resp = entry->response; msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz; break; } case KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST: { struct ksmbd_spnego_authen_response *resp = entry->response; msg_sz = sizeof(struct ksmbd_spnego_authen_response) + resp->session_key_len + resp->spnego_blob_len; break; } case KSMBD_EVENT_SHARE_CONFIG_REQUEST: { struct ksmbd_share_config_response *resp = entry->response; if (resp->payload_sz) { if (resp->payload_sz < resp->veto_list_sz) return -EINVAL; msg_sz = sizeof(struct ksmbd_share_config_response) + resp->payload_sz; } break; } case KSMBD_EVENT_LOGIN_REQUEST_EXT: { struct ksmbd_login_response_ext *resp = entry->response; if (resp->ngroups) { msg_sz = sizeof(struct ksmbd_login_response_ext) + resp->ngroups * sizeof(gid_t); } } } return entry->msg_sz != msg_sz ? -EINVAL : 0; } static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle) { struct ipc_msg_table_entry entry; int ret; if ((int)handle < 0) return NULL; entry.type = msg->type; entry.response = NULL; init_waitqueue_head(&entry.wait); down_write(&ipc_msg_table_lock); entry.handle = handle; hash_add(ipc_msg_table, &entry.ipc_table_hlist, entry.handle); up_write(&ipc_msg_table_lock); ret = ipc_msg_send(msg); if (ret) goto out; ret = wait_event_interruptible_timeout(entry.wait, entry.response != NULL, IPC_WAIT_TIMEOUT); if (entry.response) { ret = ipc_validate_msg(&entry); if (ret) { kvfree(entry.response); entry.response = NULL; } } out: down_write(&ipc_msg_table_lock); hash_del(&entry.ipc_table_hlist); up_write(&ipc_msg_table_lock); return entry.response; } static int ksmbd_ipc_heartbeat_request(void) { struct ksmbd_ipc_msg *msg; int ret; msg = ipc_msg_alloc(sizeof(struct ksmbd_heartbeat)); if (!msg) return -EINVAL; msg->type = KSMBD_EVENT_HEARTBEAT_REQUEST; ret = ipc_msg_send(msg); ipc_msg_free(msg); return ret; } struct ksmbd_login_response *ksmbd_ipc_login_request(const char *account) { struct ksmbd_ipc_msg *msg; struct ksmbd_login_request *req; struct ksmbd_login_response *resp; if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) return NULL; msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request)); if (!msg) return NULL; msg->type = KSMBD_EVENT_LOGIN_REQUEST; req = (struct ksmbd_login_request *)msg->payload; req->handle = ksmbd_acquire_id(&ipc_ida); strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_handle_free(req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_login_response_ext *ksmbd_ipc_login_request_ext(const char *account) { struct ksmbd_ipc_msg *msg; struct ksmbd_login_request *req; struct ksmbd_login_response_ext *resp; if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) return NULL; msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request)); if (!msg) return NULL; msg->type = KSMBD_EVENT_LOGIN_REQUEST_EXT; req = (struct ksmbd_login_request *)msg->payload; req->handle = ksmbd_acquire_id(&ipc_ida); strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_handle_free(req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_spnego_authen_response * ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len) { struct ksmbd_ipc_msg *msg; struct ksmbd_spnego_authen_request *req; struct ksmbd_spnego_authen_response *resp; msg = ipc_msg_alloc(sizeof(struct ksmbd_spnego_authen_request) + blob_len + 1); if (!msg) return NULL; msg->type = KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST; req = (struct ksmbd_spnego_authen_request *)msg->payload; req->handle = ksmbd_acquire_id(&ipc_ida); req->spnego_blob_len = blob_len; memcpy(req->spnego_blob, spnego_blob, blob_len); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_handle_free(req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_tree_connect_response * ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess, struct ksmbd_share_config *share, struct ksmbd_tree_connect *tree_conn, struct sockaddr *peer_addr) { struct ksmbd_ipc_msg *msg; struct ksmbd_tree_connect_request *req; struct ksmbd_tree_connect_response *resp; if (strlen(user_name(sess->user)) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) return NULL; if (strlen(share->name) >= KSMBD_REQ_MAX_SHARE_NAME) return NULL; msg = ipc_msg_alloc(sizeof(struct ksmbd_tree_connect_request)); if (!msg) return NULL; msg->type = KSMBD_EVENT_TREE_CONNECT_REQUEST; req = (struct ksmbd_tree_connect_request *)msg->payload; req->handle = ksmbd_acquire_id(&ipc_ida); req->account_flags = sess->user->flags; req->session_id = sess->id; req->connect_id = tree_conn->id; strscpy(req->account, user_name(sess->user), KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); strscpy(req->share, share->name, KSMBD_REQ_MAX_SHARE_NAME); snprintf(req->peer_addr, sizeof(req->peer_addr), "%pIS", peer_addr); if (peer_addr->sa_family == AF_INET6) req->flags |= KSMBD_TREE_CONN_FLAG_REQUEST_IPV6; if (test_session_flag(sess, CIFDS_SESSION_FLAG_SMB2)) req->flags |= KSMBD_TREE_CONN_FLAG_REQUEST_SMB2; resp = ipc_msg_send_request(msg, req->handle); ipc_msg_handle_free(req->handle); ipc_msg_free(msg); return resp; } int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, unsigned long long connect_id) { struct ksmbd_ipc_msg *msg; struct ksmbd_tree_disconnect_request *req; int ret; msg = ipc_msg_alloc(sizeof(struct ksmbd_tree_disconnect_request)); if (!msg) return -ENOMEM; msg->type = KSMBD_EVENT_TREE_DISCONNECT_REQUEST; req = (struct ksmbd_tree_disconnect_request *)msg->payload; req->session_id = session_id; req->connect_id = connect_id; ret = ipc_msg_send(msg); ipc_msg_free(msg); return ret; } int ksmbd_ipc_logout_request(const char *account, int flags) { struct ksmbd_ipc_msg *msg; struct ksmbd_logout_request *req; int ret; if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) return -EINVAL; msg = ipc_msg_alloc(sizeof(struct ksmbd_logout_request)); if (!msg) return -ENOMEM; msg->type = KSMBD_EVENT_LOGOUT_REQUEST; req = (struct ksmbd_logout_request *)msg->payload; req->account_flags = flags; strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); ret = ipc_msg_send(msg); ipc_msg_free(msg); return ret; } struct ksmbd_share_config_response * ksmbd_ipc_share_config_request(const char *name) { struct ksmbd_ipc_msg *msg; struct ksmbd_share_config_request *req; struct ksmbd_share_config_response *resp; if (strlen(name) >= KSMBD_REQ_MAX_SHARE_NAME) return NULL; msg = ipc_msg_alloc(sizeof(struct ksmbd_share_config_request)); if (!msg) return NULL; msg->type = KSMBD_EVENT_SHARE_CONFIG_REQUEST; req = (struct ksmbd_share_config_request *)msg->payload; req->handle = ksmbd_acquire_id(&ipc_ida); strscpy(req->share_name, name, KSMBD_REQ_MAX_SHARE_NAME); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_handle_free(req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_rpc_command *ksmbd_rpc_open(struct ksmbd_session *sess, int handle) { struct ksmbd_ipc_msg *msg; struct ksmbd_rpc_command *req; struct ksmbd_rpc_command *resp; msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command)); if (!msg) return NULL; msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; req->flags = ksmbd_session_rpc_method(sess, handle); req->flags |= KSMBD_RPC_OPEN_METHOD; req->payload_sz = 0; resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_rpc_command *ksmbd_rpc_close(struct ksmbd_session *sess, int handle) { struct ksmbd_ipc_msg *msg; struct ksmbd_rpc_command *req; struct ksmbd_rpc_command *resp; msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command)); if (!msg) return NULL; msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; req->flags = ksmbd_session_rpc_method(sess, handle); req->flags |= KSMBD_RPC_CLOSE_METHOD; req->payload_sz = 0; resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle, void *payload, size_t payload_sz) { struct ksmbd_ipc_msg *msg; struct ksmbd_rpc_command *req; struct ksmbd_rpc_command *resp; msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); if (!msg) return NULL; msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; req->flags = ksmbd_session_rpc_method(sess, handle); req->flags |= rpc_context_flags(sess); req->flags |= KSMBD_RPC_WRITE_METHOD; req->payload_sz = payload_sz; memcpy(req->payload, payload, payload_sz); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle) { struct ksmbd_ipc_msg *msg; struct ksmbd_rpc_command *req; struct ksmbd_rpc_command *resp; msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command)); if (!msg) return NULL; msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; req->flags = ksmbd_session_rpc_method(sess, handle); req->flags |= rpc_context_flags(sess); req->flags |= KSMBD_RPC_READ_METHOD; req->payload_sz = 0; resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle, void *payload, size_t payload_sz) { struct ksmbd_ipc_msg *msg; struct ksmbd_rpc_command *req; struct ksmbd_rpc_command *resp; msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); if (!msg) return NULL; msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; req->flags = ksmbd_session_rpc_method(sess, handle); req->flags |= rpc_context_flags(sess); req->flags |= KSMBD_RPC_IOCTL_METHOD; req->payload_sz = payload_sz; memcpy(req->payload, payload, payload_sz); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); return resp; } struct ksmbd_rpc_command *ksmbd_rpc_rap(struct ksmbd_session *sess, void *payload, size_t payload_sz) { struct ksmbd_ipc_msg *msg; struct ksmbd_rpc_command *req; struct ksmbd_rpc_command *resp; msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); if (!msg) return NULL; msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = ksmbd_acquire_id(&ipc_ida); req->flags = rpc_context_flags(sess); req->flags |= KSMBD_RPC_RAP_METHOD; req->payload_sz = payload_sz; memcpy(req->payload, payload, payload_sz); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_handle_free(req->handle); ipc_msg_free(msg); return resp; } static int __ipc_heartbeat(void) { unsigned long delta; if (!ksmbd_server_running()) return 0; if (time_after(jiffies, server_conf.ipc_last_active)) { delta = (jiffies - server_conf.ipc_last_active); } else { ipc_update_last_active(); schedule_delayed_work(&ipc_timer_work, server_conf.ipc_timeout); return 0; } if (delta < server_conf.ipc_timeout) { schedule_delayed_work(&ipc_timer_work, server_conf.ipc_timeout - delta); return 0; } if (ksmbd_ipc_heartbeat_request() == 0) { schedule_delayed_work(&ipc_timer_work, server_conf.ipc_timeout); return 0; } mutex_lock(&startup_lock); WRITE_ONCE(server_conf.state, SERVER_STATE_RESETTING); server_conf.ipc_last_active = 0; ksmbd_tools_pid = 0; pr_err("No IPC daemon response for %lus\n", delta / HZ); mutex_unlock(&startup_lock); return -EINVAL; } static void ipc_timer_heartbeat(struct work_struct *w) { if (__ipc_heartbeat()) server_queue_ctrl_reset_work(); } int ksmbd_ipc_id_alloc(void) { return ksmbd_acquire_id(&ipc_ida); } void ksmbd_rpc_id_free(int handle) { ksmbd_release_id(&ipc_ida, handle); } void ksmbd_ipc_release(void) { cancel_delayed_work_sync(&ipc_timer_work); genl_unregister_family(&ksmbd_genl_family); } void ksmbd_ipc_soft_reset(void) { mutex_lock(&startup_lock); ksmbd_tools_pid = 0; cancel_delayed_work_sync(&ipc_timer_work); mutex_unlock(&startup_lock); } int ksmbd_ipc_init(void) { int ret = 0; ksmbd_nl_init_fixup(); INIT_DELAYED_WORK(&ipc_timer_work, ipc_timer_heartbeat); ret = genl_register_family(&ksmbd_genl_family); if (ret) { pr_err("Failed to register KSMBD netlink interface %d\n", ret); cancel_delayed_work_sync(&ipc_timer_work); } return ret; } |
2 2 2 2 34 32 6 6 6 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 32 31 31 31 31 31 31 31 31 31 31 31 2 34 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010-2013 Felix Fietkau <nbd@openwrt.org> * Copyright (C) 2019-2022 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/skbuff.h> #include <linux/debugfs.h> #include <linux/random.h> #include <linux/moduleparam.h> #include <linux/ieee80211.h> #include <linux/minmax.h> #include <net/mac80211.h> #include "rate.h" #include "sta_info.h" #include "rc80211_minstrel_ht.h" #define AVG_AMPDU_SIZE 16 #define AVG_PKT_SIZE 1200 /* Number of bits for an average sized packet */ #define MCS_NBITS ((AVG_PKT_SIZE * AVG_AMPDU_SIZE) << 3) /* Number of symbols for a packet with (bps) bits per symbol */ #define MCS_NSYMS(bps) DIV_ROUND_UP(MCS_NBITS, (bps)) /* Transmission time (nanoseconds) for a packet containing (syms) symbols */ #define MCS_SYMBOL_TIME(sgi, syms) \ (sgi ? \ ((syms) * 18000 + 4000) / 5 : /* syms * 3.6 us */ \ ((syms) * 1000) << 2 /* syms * 4 us */ \ ) /* Transmit duration for the raw data part of an average sized packet */ #define MCS_DURATION(streams, sgi, bps) \ (MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps))) / AVG_AMPDU_SIZE) #define BW_20 0 #define BW_40 1 #define BW_80 2 /* * Define group sort order: HT40 -> SGI -> #streams */ #define GROUP_IDX(_streams, _sgi, _ht40) \ MINSTREL_HT_GROUP_0 + \ MINSTREL_MAX_STREAMS * 2 * _ht40 + \ MINSTREL_MAX_STREAMS * _sgi + \ _streams - 1 #define _MAX(a, b) (((a)>(b))?(a):(b)) #define GROUP_SHIFT(duration) \ _MAX(0, 16 - __builtin_clz(duration)) /* MCS rate information for an MCS group */ #define __MCS_GROUP(_streams, _sgi, _ht40, _s) \ [GROUP_IDX(_streams, _sgi, _ht40)] = { \ .streams = _streams, \ .shift = _s, \ .bw = _ht40, \ .flags = \ IEEE80211_TX_RC_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ (_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ .duration = { \ MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 162 : 78) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) >> _s \ } \ } #define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26)) #define MCS_GROUP(_streams, _sgi, _ht40) \ __MCS_GROUP(_streams, _sgi, _ht40, \ MCS_GROUP_SHIFT(_streams, _sgi, _ht40)) #define VHT_GROUP_IDX(_streams, _sgi, _bw) \ (MINSTREL_VHT_GROUP_0 + \ MINSTREL_MAX_STREAMS * 2 * (_bw) + \ MINSTREL_MAX_STREAMS * (_sgi) + \ (_streams) - 1) #define BW2VBPS(_bw, r3, r2, r1) \ (_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) #define __VHT_GROUP(_streams, _sgi, _bw, _s) \ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ .streams = _streams, \ .shift = _s, \ .bw = _bw, \ .flags = \ IEEE80211_TX_RC_VHT_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ (_bw == BW_80 ? IEEE80211_TX_RC_80_MHZ_WIDTH : \ _bw == BW_40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ .duration = { \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 117, 54, 26)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 234, 108, 52)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 351, 162, 78)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 468, 216, 104)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 702, 324, 156)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 936, 432, 208)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1053, 486, 234)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1170, 540, 260)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1404, 648, 312)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1560, 720, 346)) >> _s \ } \ } #define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 117, 54, 26))) #define VHT_GROUP(_streams, _sgi, _bw) \ __VHT_GROUP(_streams, _sgi, _bw, \ VHT_GROUP_SHIFT(_streams, _sgi, _bw)) #define CCK_DURATION(_bitrate, _short) \ (1000 * (10 /* SIFS */ + \ (_short ? 72 + 24 : 144 + 48) + \ (8 * (AVG_PKT_SIZE + 4) * 10) / (_bitrate))) #define CCK_DURATION_LIST(_short, _s) \ CCK_DURATION(10, _short) >> _s, \ CCK_DURATION(20, _short) >> _s, \ CCK_DURATION(55, _short) >> _s, \ CCK_DURATION(110, _short) >> _s #define __CCK_GROUP(_s) \ [MINSTREL_CCK_GROUP] = { \ .streams = 1, \ .flags = 0, \ .shift = _s, \ .duration = { \ CCK_DURATION_LIST(false, _s), \ CCK_DURATION_LIST(true, _s) \ } \ } #define CCK_GROUP_SHIFT \ GROUP_SHIFT(CCK_DURATION(10, false)) #define CCK_GROUP __CCK_GROUP(CCK_GROUP_SHIFT) #define OFDM_DURATION(_bitrate) \ (1000 * (16 /* SIFS + signal ext */ + \ 16 /* T_PREAMBLE */ + \ 4 /* T_SIGNAL */ + \ 4 * (((16 + 80 * (AVG_PKT_SIZE + 4) + 6) / \ ((_bitrate) * 4))))) #define OFDM_DURATION_LIST(_s) \ OFDM_DURATION(60) >> _s, \ OFDM_DURATION(90) >> _s, \ OFDM_DURATION(120) >> _s, \ OFDM_DURATION(180) >> _s, \ OFDM_DURATION(240) >> _s, \ OFDM_DURATION(360) >> _s, \ OFDM_DURATION(480) >> _s, \ OFDM_DURATION(540) >> _s #define __OFDM_GROUP(_s) \ [MINSTREL_OFDM_GROUP] = { \ .streams = 1, \ .flags = 0, \ .shift = _s, \ .duration = { \ OFDM_DURATION_LIST(_s), \ } \ } #define OFDM_GROUP_SHIFT \ GROUP_SHIFT(OFDM_DURATION(60)) #define OFDM_GROUP __OFDM_GROUP(OFDM_GROUP_SHIFT) static bool minstrel_vht_only = true; module_param(minstrel_vht_only, bool, 0644); MODULE_PARM_DESC(minstrel_vht_only, "Use only VHT rates when VHT is supported by sta."); /* * To enable sufficiently targeted rate sampling, MCS rates are divided into * groups, based on the number of streams and flags (HT40, SGI) that they * use. * * Sortorder has to be fixed for GROUP_IDX macro to be applicable: * BW -> SGI -> #streams */ const struct mcs_group minstrel_mcs_groups[] = { MCS_GROUP(1, 0, BW_20), MCS_GROUP(2, 0, BW_20), MCS_GROUP(3, 0, BW_20), MCS_GROUP(4, 0, BW_20), MCS_GROUP(1, 1, BW_20), MCS_GROUP(2, 1, BW_20), MCS_GROUP(3, 1, BW_20), MCS_GROUP(4, 1, BW_20), MCS_GROUP(1, 0, BW_40), MCS_GROUP(2, 0, BW_40), MCS_GROUP(3, 0, BW_40), MCS_GROUP(4, 0, BW_40), MCS_GROUP(1, 1, BW_40), MCS_GROUP(2, 1, BW_40), MCS_GROUP(3, 1, BW_40), MCS_GROUP(4, 1, BW_40), CCK_GROUP, OFDM_GROUP, VHT_GROUP(1, 0, BW_20), VHT_GROUP(2, 0, BW_20), VHT_GROUP(3, 0, BW_20), VHT_GROUP(4, 0, BW_20), VHT_GROUP(1, 1, BW_20), VHT_GROUP(2, 1, BW_20), VHT_GROUP(3, 1, BW_20), VHT_GROUP(4, 1, BW_20), VHT_GROUP(1, 0, BW_40), VHT_GROUP(2, 0, BW_40), VHT_GROUP(3, 0, BW_40), VHT_GROUP(4, 0, BW_40), VHT_GROUP(1, 1, BW_40), VHT_GROUP(2, 1, BW_40), VHT_GROUP(3, 1, BW_40), VHT_GROUP(4, 1, BW_40), VHT_GROUP(1, 0, BW_80), VHT_GROUP(2, 0, BW_80), VHT_GROUP(3, 0, BW_80), VHT_GROUP(4, 0, BW_80), VHT_GROUP(1, 1, BW_80), VHT_GROUP(2, 1, BW_80), VHT_GROUP(3, 1, BW_80), VHT_GROUP(4, 1, BW_80), }; const s16 minstrel_cck_bitrates[4] = { 10, 20, 55, 110 }; const s16 minstrel_ofdm_bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; static const u8 minstrel_sample_seq[] = { MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_JUMP, MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_JUMP, MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_SLOW, }; static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi); /* * Some VHT MCSes are invalid (when Ndbps / Nes is not an integer) * e.g for MCS9@20MHzx1Nss: Ndbps=8x52*(5/6) Nes=1 * * Returns the valid mcs map for struct minstrel_mcs_group_data.supported */ static u16 minstrel_get_valid_vht_rates(int bw, int nss, __le16 mcs_map) { u16 mask = 0; if (bw == BW_20) { if (nss != 3 && nss != 6) mask = BIT(9); } else if (bw == BW_80) { if (nss == 3 || nss == 7) mask = BIT(6); else if (nss == 6) mask = BIT(9); } else { WARN_ON(bw != BW_40); } switch ((le16_to_cpu(mcs_map) >> (2 * (nss - 1))) & 3) { case IEEE80211_VHT_MCS_SUPPORT_0_7: mask |= 0x300; break; case IEEE80211_VHT_MCS_SUPPORT_0_8: mask |= 0x200; break; case IEEE80211_VHT_MCS_SUPPORT_0_9: break; default: mask = 0x3ff; } return 0x3ff & ~mask; } static bool minstrel_ht_is_legacy_group(int group) { return group == MINSTREL_CCK_GROUP || group == MINSTREL_OFDM_GROUP; } /* * Look up an MCS group index based on mac80211 rate information */ static int minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate) { return GROUP_IDX((rate->idx / 8) + 1, !!(rate->flags & IEEE80211_TX_RC_SHORT_GI), !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)); } /* * Look up an MCS group index based on new cfg80211 rate_info. */ static int minstrel_ht_ri_get_group_idx(struct rate_info *rate) { return GROUP_IDX((rate->mcs / 8) + 1, !!(rate->flags & RATE_INFO_FLAGS_SHORT_GI), !!(rate->bw & RATE_INFO_BW_40)); } static int minstrel_vht_get_group_idx(struct ieee80211_tx_rate *rate) { return VHT_GROUP_IDX(ieee80211_rate_get_vht_nss(rate), !!(rate->flags & IEEE80211_TX_RC_SHORT_GI), !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + 2*!!(rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)); } /* * Look up an MCS group index based on new cfg80211 rate_info. */ static int minstrel_vht_ri_get_group_idx(struct rate_info *rate) { return VHT_GROUP_IDX(rate->nss, !!(rate->flags & RATE_INFO_FLAGS_SHORT_GI), !!(rate->bw & RATE_INFO_BW_40) + 2*!!(rate->bw & RATE_INFO_BW_80)); } static struct minstrel_rate_stats * minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_tx_rate *rate) { int group, idx; if (rate->flags & IEEE80211_TX_RC_MCS) { group = minstrel_ht_get_group_idx(rate); idx = rate->idx % 8; goto out; } if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { group = minstrel_vht_get_group_idx(rate); idx = ieee80211_rate_get_vht_mcs(rate); goto out; } group = MINSTREL_CCK_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++) { if (!(mi->supported[group] & BIT(idx))) continue; if (rate->idx != mp->cck_rates[idx]) continue; /* short preamble */ if ((mi->supported[group] & BIT(idx + 4)) && (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)) idx += 4; goto out; } group = MINSTREL_OFDM_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->ofdm_rates[0]); idx++) if (rate->idx == mp->ofdm_rates[mi->band][idx]) goto out; idx = 0; out: return &mi->groups[group].rates[idx]; } /* * Get the minstrel rate statistics for specified STA and rate info. */ static struct minstrel_rate_stats * minstrel_ht_ri_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_rate_status *rate_status) { int group, idx; struct rate_info *rate = &rate_status->rate_idx; if (rate->flags & RATE_INFO_FLAGS_MCS) { group = minstrel_ht_ri_get_group_idx(rate); idx = rate->mcs % 8; goto out; } if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) { group = minstrel_vht_ri_get_group_idx(rate); idx = rate->mcs; goto out; } group = MINSTREL_CCK_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++) { if (rate->legacy != minstrel_cck_bitrates[ mp->cck_rates[idx] ]) continue; /* short preamble */ if ((mi->supported[group] & BIT(idx + 4)) && mi->use_short_preamble) idx += 4; goto out; } group = MINSTREL_OFDM_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->ofdm_rates[0]); idx++) if (rate->legacy == minstrel_ofdm_bitrates[ mp->ofdm_rates[mi->band][idx] ]) goto out; idx = 0; out: return &mi->groups[group].rates[idx]; } static inline struct minstrel_rate_stats * minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index) { return &mi->groups[MI_RATE_GROUP(index)].rates[MI_RATE_IDX(index)]; } static inline int minstrel_get_duration(int index) { const struct mcs_group *group = &minstrel_mcs_groups[MI_RATE_GROUP(index)]; unsigned int duration = group->duration[MI_RATE_IDX(index)]; return duration << group->shift; } static unsigned int minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi) { int duration; if (mi->avg_ampdu_len) return MINSTREL_TRUNC(mi->avg_ampdu_len); if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(mi->max_tp_rate[0]))) return 1; duration = minstrel_get_duration(mi->max_tp_rate[0]); if (duration > 400 * 1000) return 2; if (duration > 250 * 1000) return 4; if (duration > 150 * 1000) return 8; return 16; } /* * Return current throughput based on the average A-MPDU length, taking into * account the expected number of retransmissions and their expected length */ int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, int prob_avg) { unsigned int nsecs = 0, overhead = mi->overhead; unsigned int ampdu_len = 1; /* do not account throughput if success prob is below 10% */ if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; if (minstrel_ht_is_legacy_group(group)) overhead = mi->overhead_legacy; else ampdu_len = minstrel_ht_avg_ampdu_len(mi); nsecs = 1000 * overhead / ampdu_len; nsecs += minstrel_mcs_groups[group].duration[rate] << minstrel_mcs_groups[group].shift; /* * For the throughput calculation, limit the probability value to 90% to * account for collision related packet error rate fluctuation * (prob is scaled - see MINSTREL_FRAC above) */ if (prob_avg > MINSTREL_FRAC(90, 100)) prob_avg = MINSTREL_FRAC(90, 100); return MINSTREL_TRUNC(100 * ((prob_avg * 1000000) / nsecs)); } /* * Find & sort topmost throughput rates * * If multiple rates provide equal throughput the sorting is based on their * current success probability. Higher success probability is preferred among * MCS groups, CCK rates do not provide aggregation and are therefore at last. */ static void minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, u16 *tp_list) { int cur_group, cur_idx, cur_tp_avg, cur_prob; int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; int j = MAX_THR_RATES; cur_group = MI_RATE_GROUP(index); cur_idx = MI_RATE_IDX(index); cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg; cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { tmp_group = MI_RATE_GROUP(tp_list[j - 1]); tmp_idx = MI_RATE_IDX(tp_list[j - 1]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (cur_tp_avg < tmp_tp_avg || (cur_tp_avg == tmp_tp_avg && cur_prob <= tmp_prob)) break; j--; } while (j > 0); if (j < MAX_THR_RATES - 1) { memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) * (MAX_THR_RATES - (j + 1)))); } if (j < MAX_THR_RATES) tp_list[j] = index; } /* * Find and set the topmost probability rate per sta and per group */ static void minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 *dest, u16 index) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; int max_tp_group, max_tp_idx, max_tp_prob; int cur_tp_avg, cur_group, cur_idx; int max_gpr_group, max_gpr_idx; int max_gpr_tp_avg, max_gpr_prob; cur_group = MI_RATE_GROUP(index); cur_idx = MI_RATE_IDX(index); mg = &mi->groups[cur_group]; mrs = &mg->rates[cur_idx]; tmp_group = MI_RATE_GROUP(*dest); tmp_idx = MI_RATE_IDX(*dest); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from * MCS_GROUP as well as CCK_GROUP rates do not allow aggregation */ max_tp_group = MI_RATE_GROUP(mi->max_tp_rate[0]); max_tp_idx = MI_RATE_IDX(mi->max_tp_rate[0]); max_tp_prob = mi->groups[max_tp_group].rates[max_tp_idx].prob_avg; if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(index)) && !minstrel_ht_is_legacy_group(max_tp_group)) return; /* skip rates faster than max tp rate with lower prob */ if (minstrel_get_duration(mi->max_tp_rate[0]) > minstrel_get_duration(index) && mrs->prob_avg < max_tp_prob) return; max_gpr_group = MI_RATE_GROUP(mg->max_group_prob_rate); max_gpr_idx = MI_RATE_IDX(mg->max_group_prob_rate); max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg; if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, mrs->prob_avg); if (cur_tp_avg > tmp_tp_avg) *dest = index; max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group, max_gpr_idx, max_gpr_prob); if (cur_tp_avg > max_gpr_tp_avg) mg->max_group_prob_rate = index; } else { if (mrs->prob_avg > tmp_prob) *dest = index; if (mrs->prob_avg > max_gpr_prob) mg->max_group_prob_rate = index; } } /* * Assign new rate set per sta and use CCK rates only if the fastest * rate (max_tp_rate[0]) is from CCK group. This prohibits such sorted * rate sets where MCS and CCK rates are mixed, because CCK rates can * not use aggregation. */ static void minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, u16 tmp_mcs_tp_rate[MAX_THR_RATES], u16 tmp_legacy_tp_rate[MAX_THR_RATES]) { unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp, tmp_prob; int i; tmp_group = MI_RATE_GROUP(tmp_legacy_tp_rate[0]); tmp_idx = MI_RATE_IDX(tmp_legacy_tp_rate[0]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); tmp_group = MI_RATE_GROUP(tmp_mcs_tp_rate[0]); tmp_idx = MI_RATE_IDX(tmp_mcs_tp_rate[0]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { minstrel_ht_sort_best_tp_rates(mi, tmp_legacy_tp_rate[i], tmp_mcs_tp_rate); } } } /* * Try to increase robustness of max_prob rate by decrease number of * streams if possible. */ static inline void minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; int tmp_max_streams, group, tmp_idx, tmp_prob; int tmp_tp = 0; if (!mi->sta->deflink.ht_cap.ht_supported) return; group = MI_RATE_GROUP(mi->max_tp_rate[0]); tmp_max_streams = minstrel_mcs_groups[group].streams; for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; if (!mi->supported[group] || group == MINSTREL_CCK_GROUP) continue; tmp_idx = MI_RATE_IDX(mg->max_group_prob_rate); tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg; if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && (minstrel_mcs_groups[group].streams < tmp_max_streams)) { mi->max_prob_rate = mg->max_group_prob_rate; tmp_tp = minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob); } } } static u16 __minstrel_ht_get_sample_rate(struct minstrel_ht_sta *mi, enum minstrel_sample_type type) { u16 *rates = mi->sample[type].sample_rates; u16 cur; int i; for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) { if (!rates[i]) continue; cur = rates[i]; rates[i] = 0; return cur; } return 0; } static inline int minstrel_ewma(int old, int new, int weight) { int diff, incr; diff = new - old; incr = (EWMA_DIV - weight) * diff / EWMA_DIV; return old + incr; } static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) { s32 out_1 = *prev_1; s32 out_2 = *prev_2; s32 val; if (!in) in += 1; if (!out_1) { val = out_1 = in; goto out; } val = MINSTREL_AVG_COEFF1 * in; val += MINSTREL_AVG_COEFF2 * out_1; val += MINSTREL_AVG_COEFF3 * out_2; val >>= MINSTREL_SCALE; if (val > 1 << MINSTREL_SCALE) val = 1 << MINSTREL_SCALE; if (val < 0) val = 1; out: *prev_2 = out_1; *prev_1 = val; return val; } /* * Recalculate statistics and counters of a given rate */ static void minstrel_ht_calc_rate_stats(struct minstrel_priv *mp, struct minstrel_rate_stats *mrs) { unsigned int cur_prob; if (unlikely(mrs->attempts > 0)) { cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); minstrel_filter_avg_add(&mrs->prob_avg, &mrs->prob_avg_1, cur_prob); mrs->att_hist += mrs->attempts; mrs->succ_hist += mrs->success; } mrs->last_success = mrs->success; mrs->last_attempts = mrs->attempts; mrs->success = 0; mrs->attempts = 0; } static bool minstrel_ht_find_sample_rate(struct minstrel_ht_sta *mi, int type, int idx) { int i; for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) { u16 cur = mi->sample[type].sample_rates[i]; if (cur == idx) return true; if (!cur) break; } return false; } static int minstrel_ht_move_sample_rates(struct minstrel_ht_sta *mi, int type, u32 fast_rate_dur, u32 slow_rate_dur) { u16 *rates = mi->sample[type].sample_rates; int i, j; for (i = 0, j = 0; i < MINSTREL_SAMPLE_RATES; i++) { u32 duration; bool valid = false; u16 cur; cur = rates[i]; if (!cur) continue; duration = minstrel_get_duration(cur); switch (type) { case MINSTREL_SAMPLE_TYPE_SLOW: valid = duration > fast_rate_dur && duration < slow_rate_dur; break; case MINSTREL_SAMPLE_TYPE_INC: case MINSTREL_SAMPLE_TYPE_JUMP: valid = duration < fast_rate_dur; break; default: valid = false; break; } if (!valid) { rates[i] = 0; continue; } if (i == j) continue; rates[j++] = cur; rates[i] = 0; } return j; } static int minstrel_ht_group_min_rate_offset(struct minstrel_ht_sta *mi, int group, u32 max_duration) { u16 supported = mi->supported[group]; int i; for (i = 0; i < MCS_GROUP_RATES && supported; i++, supported >>= 1) { if (!(supported & BIT(0))) continue; if (minstrel_get_duration(MI_RATE(group, i)) >= max_duration) continue; return i; } return -1; } /* * Incremental update rates: * Flip through groups and pick the first group rate that is faster than the * highest currently selected rate */ static u16 minstrel_ht_next_inc_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur) { u8 type = MINSTREL_SAMPLE_TYPE_INC; int i, index = 0; u8 group; group = mi->sample[type].sample_group; for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); index = minstrel_ht_group_min_rate_offset(mi, group, fast_rate_dur); if (index < 0) continue; index = MI_RATE(group, index & 0xf); if (!minstrel_ht_find_sample_rate(mi, type, index)) goto out; } index = 0; out: mi->sample[type].sample_group = group; return index; } static int minstrel_ht_next_group_sample_rate(struct minstrel_ht_sta *mi, int group, u16 supported, int offset) { struct minstrel_mcs_group_data *mg = &mi->groups[group]; u16 idx; int i; for (i = 0; i < MCS_GROUP_RATES; i++) { idx = sample_table[mg->column][mg->index]; if (++mg->index >= MCS_GROUP_RATES) { mg->index = 0; if (++mg->column >= ARRAY_SIZE(sample_table)) mg->column = 0; } if (idx < offset) continue; if (!(supported & BIT(idx))) continue; return MI_RATE(group, idx); } return -1; } /* * Jump rates: * Sample random rates, use those that are faster than the highest * currently selected rate. Rates between the fastest and the slowest * get sorted into the slow sample bucket, but only if it has room */ static u16 minstrel_ht_next_jump_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur, u32 slow_rate_dur, int *slow_rate_ofs) { struct minstrel_rate_stats *mrs; u32 max_duration = slow_rate_dur; int i, index, offset; u16 *slow_rates; u16 supported; u32 duration; u8 group; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) max_duration = fast_rate_dur; slow_rates = mi->sample[MINSTREL_SAMPLE_TYPE_SLOW].sample_rates; group = mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_group; for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { u8 type; group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); supported = mi->supported[group]; if (!supported) continue; offset = minstrel_ht_group_min_rate_offset(mi, group, max_duration); if (offset < 0) continue; index = minstrel_ht_next_group_sample_rate(mi, group, supported, offset); if (index < 0) continue; duration = minstrel_get_duration(index); if (duration < fast_rate_dur) type = MINSTREL_SAMPLE_TYPE_JUMP; else type = MINSTREL_SAMPLE_TYPE_SLOW; if (minstrel_ht_find_sample_rate(mi, type, index)) continue; if (type == MINSTREL_SAMPLE_TYPE_JUMP) goto found; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) continue; if (duration >= slow_rate_dur) continue; /* skip slow rates with high success probability */ mrs = minstrel_get_ratestats(mi, index); if (mrs->prob_avg > MINSTREL_FRAC(95, 100)) continue; slow_rates[(*slow_rate_ofs)++] = index; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) max_duration = fast_rate_dur; } index = 0; found: mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_group = group; return index; } static void minstrel_ht_refill_sample_rates(struct minstrel_ht_sta *mi) { u32 prob_dur = minstrel_get_duration(mi->max_prob_rate); u32 tp_dur = minstrel_get_duration(mi->max_tp_rate[0]); u32 tp2_dur = minstrel_get_duration(mi->max_tp_rate[1]); u32 fast_rate_dur = min(min(tp_dur, tp2_dur), prob_dur); u32 slow_rate_dur = max(max(tp_dur, tp2_dur), prob_dur); u16 *rates; int i, j; rates = mi->sample[MINSTREL_SAMPLE_TYPE_INC].sample_rates; i = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_INC, fast_rate_dur, slow_rate_dur); while (i < MINSTREL_SAMPLE_RATES) { rates[i] = minstrel_ht_next_inc_rate(mi, tp_dur); if (!rates[i]) break; i++; } rates = mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_rates; i = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_JUMP, fast_rate_dur, slow_rate_dur); j = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_SLOW, fast_rate_dur, slow_rate_dur); while (i < MINSTREL_SAMPLE_RATES) { rates[i] = minstrel_ht_next_jump_rate(mi, fast_rate_dur, slow_rate_dur, &j); if (!rates[i]) break; i++; } for (i = 0; i < ARRAY_SIZE(mi->sample); i++) memcpy(mi->sample[i].cur_sample_rates, mi->sample[i].sample_rates, sizeof(mi->sample[i].cur_sample_rates)); } /* * Update rate statistics and select new primary rates * * Rules for rate selection: * - max_prob_rate must use only one stream, as a tradeoff between delivery * probability and throughput during strong fluctuations * - as long as the max prob rate has a probability of more than 75%, pick * higher throughput rates, even if the probability is a bit lower */ static void minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; int group, i, j, cur_prob; u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; u16 tmp_legacy_tp_rate[MAX_THR_RATES], tmp_max_prob_rate; u16 index; bool ht_supported = mi->sta->deflink.ht_cap.ht_supported; if (mi->ampdu_packets > 0) { if (!ieee80211_hw_check(mp->hw, TX_STATUS_NO_AMPDU_LEN)) mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len, MINSTREL_FRAC(mi->ampdu_len, mi->ampdu_packets), EWMA_LEVEL); else mi->avg_ampdu_len = 0; mi->ampdu_len = 0; mi->ampdu_packets = 0; } if (mi->supported[MINSTREL_CCK_GROUP]) group = MINSTREL_CCK_GROUP; else if (mi->supported[MINSTREL_OFDM_GROUP]) group = MINSTREL_OFDM_GROUP; else group = 0; index = MI_RATE(group, 0); for (j = 0; j < ARRAY_SIZE(tmp_legacy_tp_rate); j++) tmp_legacy_tp_rate[j] = index; if (mi->supported[MINSTREL_VHT_GROUP_0]) group = MINSTREL_VHT_GROUP_0; else if (ht_supported) group = MINSTREL_HT_GROUP_0; else if (mi->supported[MINSTREL_CCK_GROUP]) group = MINSTREL_CCK_GROUP; else group = MINSTREL_OFDM_GROUP; index = MI_RATE(group, 0); tmp_max_prob_rate = index; for (j = 0; j < ARRAY_SIZE(tmp_mcs_tp_rate); j++) tmp_mcs_tp_rate[j] = index; /* Find best rate sets within all MCS groups*/ for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { u16 *tp_rate = tmp_mcs_tp_rate; u16 last_prob = 0; mg = &mi->groups[group]; if (!mi->supported[group]) continue; /* (re)Initialize group rate indexes */ for(j = 0; j < MAX_THR_RATES; j++) tmp_group_tp_rate[j] = MI_RATE(group, 0); if (group == MINSTREL_CCK_GROUP && ht_supported) tp_rate = tmp_legacy_tp_rate; for (i = MCS_GROUP_RATES - 1; i >= 0; i--) { if (!(mi->supported[group] & BIT(i))) continue; index = MI_RATE(group, i); mrs = &mg->rates[i]; mrs->retry_updated = false; minstrel_ht_calc_rate_stats(mp, mrs); if (mrs->att_hist) last_prob = max(last_prob, mrs->prob_avg); else mrs->prob_avg = max(last_prob, mrs->prob_avg); cur_prob = mrs->prob_avg; if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; /* Find max throughput rate set */ minstrel_ht_sort_best_tp_rates(mi, index, tp_rate); /* Find max throughput rate set within a group */ minstrel_ht_sort_best_tp_rates(mi, index, tmp_group_tp_rate); } memcpy(mg->max_group_tp_rate, tmp_group_tp_rate, sizeof(mg->max_group_tp_rate)); } /* Assign new rate set per sta */ minstrel_ht_assign_best_tp_rates(mi, tmp_mcs_tp_rate, tmp_legacy_tp_rate); memcpy(mi->max_tp_rate, tmp_mcs_tp_rate, sizeof(mi->max_tp_rate)); for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { if (!mi->supported[group]) continue; mg = &mi->groups[group]; mg->max_group_prob_rate = MI_RATE(group, 0); for (i = 0; i < MCS_GROUP_RATES; i++) { if (!(mi->supported[group] & BIT(i))) continue; index = MI_RATE(group, i); /* Find max probability rate per group and global */ minstrel_ht_set_best_prob_rate(mi, &tmp_max_prob_rate, index); } } mi->max_prob_rate = tmp_max_prob_rate; /* Try to increase robustness of max_prob_rate*/ minstrel_ht_prob_rate_reduce_streams(mi); minstrel_ht_refill_sample_rates(mi); #ifdef CONFIG_MAC80211_DEBUGFS /* use fixed index if set */ if (mp->fixed_rate_idx != -1) { for (i = 0; i < 4; i++) mi->max_tp_rate[i] = mp->fixed_rate_idx; mi->max_prob_rate = mp->fixed_rate_idx; } #endif /* Reset update timer */ mi->last_stats_update = jiffies; mi->sample_time = jiffies; } static bool minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_tx_rate *rate) { int i; if (rate->idx < 0) return false; if (!rate->count) return false; if (rate->flags & IEEE80211_TX_RC_MCS || rate->flags & IEEE80211_TX_RC_VHT_MCS) return true; for (i = 0; i < ARRAY_SIZE(mp->cck_rates); i++) if (rate->idx == mp->cck_rates[i]) return true; for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates[0]); i++) if (rate->idx == mp->ofdm_rates[mi->band][i]) return true; return false; } /* * Check whether rate_status contains valid information. */ static bool minstrel_ht_ri_txstat_valid(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_rate_status *rate_status) { int i; if (!rate_status) return false; if (!rate_status->try_count) return false; if (rate_status->rate_idx.flags & RATE_INFO_FLAGS_MCS || rate_status->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS) return true; for (i = 0; i < ARRAY_SIZE(mp->cck_rates); i++) { if (rate_status->rate_idx.legacy == minstrel_cck_bitrates[ mp->cck_rates[i] ]) return true; } for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates); i++) { if (rate_status->rate_idx.legacy == minstrel_ofdm_bitrates[ mp->ofdm_rates[mi->band][i] ]) return true; } return false; } static void minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary) { int group, orig_group; orig_group = group = MI_RATE_GROUP(*idx); while (group > 0) { group--; if (!mi->supported[group]) continue; if (minstrel_mcs_groups[group].streams > minstrel_mcs_groups[orig_group].streams) continue; if (primary) *idx = mi->groups[group].max_group_tp_rate[0]; else *idx = mi->groups[group].max_group_tp_rate[1]; break; } } static void minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, void *priv_sta, struct ieee80211_tx_status *st) { struct ieee80211_tx_info *info = st->info; struct minstrel_ht_sta *mi = priv_sta; struct ieee80211_tx_rate *ar = info->status.rates; struct minstrel_rate_stats *rate, *rate2; struct minstrel_priv *mp = priv; u32 update_interval = mp->update_interval; bool last, update = false; int i; /* Ignore packet that was sent with noAck flag */ if (info->flags & IEEE80211_TX_CTL_NO_ACK) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) { info->status.ampdu_ack_len = (info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0); info->status.ampdu_len = 1; } /* wraparound */ if (mi->total_packets >= ~0 - info->status.ampdu_len) { mi->total_packets = 0; mi->sample_packets = 0; } mi->total_packets += info->status.ampdu_len; if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) mi->sample_packets += info->status.ampdu_len; mi->ampdu_packets++; mi->ampdu_len += info->status.ampdu_len; if (st->rates && st->n_rates) { last = !minstrel_ht_ri_txstat_valid(mp, mi, &(st->rates[0])); for (i = 0; !last; i++) { last = (i == st->n_rates - 1) || !minstrel_ht_ri_txstat_valid(mp, mi, &(st->rates[i + 1])); rate = minstrel_ht_ri_get_stats(mp, mi, &(st->rates[i])); if (last) rate->success += info->status.ampdu_ack_len; rate->attempts += st->rates[i].try_count * info->status.ampdu_len; } } else { last = !minstrel_ht_txstat_valid(mp, mi, &ar[0]); for (i = 0; !last; i++) { last = (i == IEEE80211_TX_MAX_RATES - 1) || !minstrel_ht_txstat_valid(mp, mi, &ar[i + 1]); rate = minstrel_ht_get_stats(mp, mi, &ar[i]); if (last) rate->success += info->status.ampdu_ack_len; rate->attempts += ar[i].count * info->status.ampdu_len; } } if (mp->hw->max_rates > 1) { /* * check for sudden death of spatial multiplexing, * downgrade to a lower number of streams if necessary. */ rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); if (rate->attempts > 30 && rate->success < rate->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); update = true; } rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); if (rate2->attempts > 30 && rate2->success < rate2->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; } } if (time_after(jiffies, mi->last_stats_update + update_interval)) { update = true; minstrel_ht_update_stats(mp, mi); } if (update) minstrel_ht_update_rates(mp, mi); } static void minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, int index) { struct minstrel_rate_stats *mrs; unsigned int tx_time, tx_time_rtscts, tx_time_data; unsigned int cw = mp->cw_min; unsigned int ctime = 0; unsigned int t_slot = 9; /* FIXME */ unsigned int ampdu_len = minstrel_ht_avg_ampdu_len(mi); unsigned int overhead = 0, overhead_rtscts = 0; mrs = minstrel_get_ratestats(mi, index); if (mrs->prob_avg < MINSTREL_FRAC(1, 10)) { mrs->retry_count = 1; mrs->retry_count_rtscts = 1; return; } mrs->retry_count = 2; mrs->retry_count_rtscts = 2; mrs->retry_updated = true; tx_time_data = minstrel_get_duration(index) * ampdu_len / 1000; /* Contention time for first 2 tries */ ctime = (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); ctime += (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(index))) { overhead = mi->overhead_legacy; overhead_rtscts = mi->overhead_legacy_rtscts; } else { overhead = mi->overhead; overhead_rtscts = mi->overhead_rtscts; } /* Total TX time for data and Contention after first 2 tries */ tx_time = ctime + 2 * (overhead + tx_time_data); tx_time_rtscts = ctime + 2 * (overhead_rtscts + tx_time_data); /* See how many more tries we can fit inside segment size */ do { /* Contention time for this try */ ctime = (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); /* Total TX time after this try */ tx_time += ctime + overhead + tx_time_data; tx_time_rtscts += ctime + overhead_rtscts + tx_time_data; if (tx_time_rtscts < mp->segment_size) mrs->retry_count_rtscts++; } while ((tx_time < mp->segment_size) && (++mrs->retry_count < mp->max_retry)); } static void minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_sta_rates *ratetbl, int offset, int index) { int group_idx = MI_RATE_GROUP(index); const struct mcs_group *group = &minstrel_mcs_groups[group_idx]; struct minstrel_rate_stats *mrs; u8 idx; u16 flags = group->flags; mrs = minstrel_get_ratestats(mi, index); if (!mrs->retry_updated) minstrel_calc_retransmit(mp, mi, index); if (mrs->prob_avg < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { ratetbl->rate[offset].count = 2; ratetbl->rate[offset].count_rts = 2; ratetbl->rate[offset].count_cts = 2; } else { ratetbl->rate[offset].count = mrs->retry_count; ratetbl->rate[offset].count_cts = mrs->retry_count; ratetbl->rate[offset].count_rts = mrs->retry_count_rtscts; } index = MI_RATE_IDX(index); if (group_idx == MINSTREL_CCK_GROUP) idx = mp->cck_rates[index % ARRAY_SIZE(mp->cck_rates)]; else if (group_idx == MINSTREL_OFDM_GROUP) idx = mp->ofdm_rates[mi->band][index % ARRAY_SIZE(mp->ofdm_rates[0])]; else if (flags & IEEE80211_TX_RC_VHT_MCS) idx = ((group->streams - 1) << 4) | (index & 0xF); else idx = index + (group->streams - 1) * 8; /* enable RTS/CTS if needed: * - if station is in dynamic SMPS (and streams > 1) * - for fallback rates, to increase chances of getting through */ if (offset > 0 || (mi->sta->deflink.smps_mode == IEEE80211_SMPS_DYNAMIC && group->streams > 1)) { ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; flags |= IEEE80211_TX_RC_USE_RTS_CTS; } ratetbl->rate[offset].idx = idx; ratetbl->rate[offset].flags = flags; } static inline int minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate) { int group = MI_RATE_GROUP(rate); rate = MI_RATE_IDX(rate); return mi->groups[group].rates[rate].prob_avg; } static int minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) { int group = MI_RATE_GROUP(mi->max_prob_rate); const struct mcs_group *g = &minstrel_mcs_groups[group]; int rate = MI_RATE_IDX(mi->max_prob_rate); unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ if (mi->groups[group].rates[rate].prob_avg < MINSTREL_FRAC(50, 100)) return 1; duration = g->duration[rate]; duration <<= g->shift; /* If the rate is slower than single-stream MCS1, make A-MSDU limit small */ if (duration > MCS_DURATION(1, 0, 52)) return 500; /* * If the rate is slower than single-stream MCS4, limit A-MSDU to usual * data packet size */ if (duration > MCS_DURATION(1, 0, 104)) return 1600; /* * If the rate is slower than single-stream MCS7, or if the max throughput * rate success probability is less than 75%, limit A-MSDU to twice the usual * data packet size */ if (duration > MCS_DURATION(1, 0, 260) || (minstrel_ht_get_prob_avg(mi, mi->max_tp_rate[0]) < MINSTREL_FRAC(75, 100))) return 3200; /* * HT A-MPDU limits maximum MPDU size under BA agreement to 4095 bytes. * Since aggregation sessions are started/stopped without txq flush, use * the limit here to avoid the complexity of having to de-aggregate * packets in the queue. */ if (!mi->sta->deflink.vht_cap.vht_supported) return IEEE80211_MAX_MPDU_LEN_HT_BA; /* unlimited */ return 0; } static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct ieee80211_sta_rates *rates; int i = 0; int max_rates = min_t(int, mp->hw->max_rates, IEEE80211_TX_RATE_TABLE_SIZE); rates = kzalloc(sizeof(*rates), GFP_ATOMIC); if (!rates) return; /* Start with max_tp_rate[0] */ minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]); /* Fill up remaining, keep one entry for max_probe_rate */ for (; i < (max_rates - 1); i++) minstrel_ht_set_rate(mp, mi, rates, i, mi->max_tp_rate[i]); if (i < max_rates) minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_prob_rate); if (i < IEEE80211_TX_RATE_TABLE_SIZE) rates->rate[i].idx = -1; mi->sta->deflink.agg.max_rc_amsdu_len = minstrel_ht_get_max_amsdu_len(mi); ieee80211_sta_recalc_aggregates(mi->sta); rate_control_set_rates(mp->hw, mi->sta, rates); } static u16 minstrel_ht_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { u8 seq; if (mp->hw->max_rates > 1) { seq = mi->sample_seq; mi->sample_seq = (seq + 1) % ARRAY_SIZE(minstrel_sample_seq); seq = minstrel_sample_seq[seq]; } else { seq = MINSTREL_SAMPLE_TYPE_INC; } return __minstrel_ht_get_sample_rate(mi, seq); } static void minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct ieee80211_tx_rate_control *txrc) { const struct mcs_group *sample_group; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_tx_rate *rate = &info->status.rates[0]; struct minstrel_ht_sta *mi = priv_sta; struct minstrel_priv *mp = priv; u16 sample_idx; info->flags |= mi->tx_flags; #ifdef CONFIG_MAC80211_DEBUGFS if (mp->fixed_rate_idx != -1) return; #endif /* Don't use EAPOL frames for sampling on non-mrr hw */ if (mp->hw->max_rates == 1 && (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) return; if (time_is_after_jiffies(mi->sample_time)) return; mi->sample_time = jiffies + MINSTREL_SAMPLE_INTERVAL; sample_idx = minstrel_ht_get_sample_rate(mp, mi); if (!sample_idx) return; sample_group = &minstrel_mcs_groups[MI_RATE_GROUP(sample_idx)]; sample_idx = MI_RATE_IDX(sample_idx); if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP] && (sample_idx >= 4) != txrc->short_preamble) return; info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; rate->count = 1; if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP]) { int idx = sample_idx % ARRAY_SIZE(mp->cck_rates); rate->idx = mp->cck_rates[idx]; } else if (sample_group == &minstrel_mcs_groups[MINSTREL_OFDM_GROUP]) { int idx = sample_idx % ARRAY_SIZE(mp->ofdm_rates[0]); rate->idx = mp->ofdm_rates[mi->band][idx]; } else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) { ieee80211_rate_set_vht(rate, MI_RATE_IDX(sample_idx), sample_group->streams); } else { rate->idx = sample_idx + (sample_group->streams - 1) * 8; } rate->flags = sample_group->flags; } static void minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { int i; if (sband->band != NL80211_BAND_2GHZ) return; if (sta->deflink.ht_cap.ht_supported && !ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) return; for (i = 0; i < 4; i++) { if (mp->cck_rates[i] == 0xff || !rate_supported(sta, sband->band, mp->cck_rates[i])) continue; mi->supported[MINSTREL_CCK_GROUP] |= BIT(i); if (sband->bitrates[i].flags & IEEE80211_RATE_SHORT_PREAMBLE) mi->supported[MINSTREL_CCK_GROUP] |= BIT(i + 4); } } static void minstrel_ht_update_ofdm(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { const u8 *rates; int i; if (sta->deflink.ht_cap.ht_supported) return; rates = mp->ofdm_rates[sband->band]; for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates[0]); i++) { if (rates[i] == 0xff || !rate_supported(sta, sband->band, rates[i])) continue; mi->supported[MINSTREL_OFDM_GROUP] |= BIT(i); } } static void minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { struct minstrel_priv *mp = priv; struct minstrel_ht_sta *mi = priv_sta; struct ieee80211_mcs_info *mcs = &sta->deflink.ht_cap.mcs; u16 ht_cap = sta->deflink.ht_cap.cap; struct ieee80211_sta_vht_cap *vht_cap = &sta->deflink.vht_cap; const struct ieee80211_rate *ctl_rate; struct sta_info *sta_info; bool ldpc, erp; int use_vht; int ack_dur; int stbc; int i; BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB); if (vht_cap->vht_supported) use_vht = vht_cap->vht_mcs.tx_mcs_map != cpu_to_le16(~0); else use_vht = 0; memset(mi, 0, sizeof(*mi)); mi->sta = sta; mi->band = sband->band; mi->last_stats_update = jiffies; ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1); mi->overhead = ieee80211_frame_duration(sband->band, 0, 60, 1, 1); mi->overhead += ack_dur; mi->overhead_rtscts = mi->overhead + 2 * ack_dur; ctl_rate = &sband->bitrates[rate_lowest_index(sband, sta)]; erp = ctl_rate->flags & IEEE80211_RATE_ERP_G; ack_dur = ieee80211_frame_duration(sband->band, 10, ctl_rate->bitrate, erp, 1); mi->overhead_legacy = ack_dur; mi->overhead_legacy_rtscts = mi->overhead_legacy + 2 * ack_dur; mi->avg_ampdu_len = MINSTREL_FRAC(1, 1); if (!use_vht) { stbc = (ht_cap & IEEE80211_HT_CAP_RX_STBC) >> IEEE80211_HT_CAP_RX_STBC_SHIFT; ldpc = ht_cap & IEEE80211_HT_CAP_LDPC_CODING; } else { stbc = (vht_cap->cap & IEEE80211_VHT_CAP_RXSTBC_MASK) >> IEEE80211_VHT_CAP_RXSTBC_SHIFT; ldpc = vht_cap->cap & IEEE80211_VHT_CAP_RXLDPC; } mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT; if (ldpc) mi->tx_flags |= IEEE80211_TX_CTL_LDPC; for (i = 0; i < ARRAY_SIZE(mi->groups); i++) { u32 gflags = minstrel_mcs_groups[i].flags; int bw, nss; mi->supported[i] = 0; if (minstrel_ht_is_legacy_group(i)) continue; if (gflags & IEEE80211_TX_RC_SHORT_GI) { if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) { if (!(ht_cap & IEEE80211_HT_CAP_SGI_40)) continue; } else { if (!(ht_cap & IEEE80211_HT_CAP_SGI_20)) continue; } } if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH && sta->deflink.bandwidth < IEEE80211_STA_RX_BW_40) continue; nss = minstrel_mcs_groups[i].streams; /* Mark MCS > 7 as unsupported if STA is in static SMPS mode */ if (sta->deflink.smps_mode == IEEE80211_SMPS_STATIC && nss > 1) continue; /* HT rate */ if (gflags & IEEE80211_TX_RC_MCS) { if (use_vht && minstrel_vht_only) continue; mi->supported[i] = mcs->rx_mask[nss - 1]; continue; } /* VHT rate */ if (!vht_cap->vht_supported || WARN_ON(!(gflags & IEEE80211_TX_RC_VHT_MCS)) || WARN_ON(gflags & IEEE80211_TX_RC_160_MHZ_WIDTH)) continue; if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) { if (sta->deflink.bandwidth < IEEE80211_STA_RX_BW_80 || ((gflags & IEEE80211_TX_RC_SHORT_GI) && !(vht_cap->cap & IEEE80211_VHT_CAP_SHORT_GI_80))) { continue; } } if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) bw = BW_40; else if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) bw = BW_80; else bw = BW_20; mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss, vht_cap->vht_mcs.tx_mcs_map); } sta_info = container_of(sta, struct sta_info, sta); mi->use_short_preamble = test_sta_flag(sta_info, WLAN_STA_SHORT_PREAMBLE) && sta_info->sdata->vif.bss_conf.use_short_preamble; minstrel_ht_update_cck(mp, mi, sband, sta); minstrel_ht_update_ofdm(mp, mi, sband, sta); /* create an initial rate table with the lowest supported rates */ minstrel_ht_update_stats(mp, mi); minstrel_ht_update_rates(mp, mi); } static void minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta); } static void minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta, u32 changed) { minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta); } static void * minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) { struct ieee80211_supported_band *sband; struct minstrel_ht_sta *mi; struct minstrel_priv *mp = priv; struct ieee80211_hw *hw = mp->hw; int max_rates = 0; int i; for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = hw->wiphy->bands[i]; if (sband && sband->n_bitrates > max_rates) max_rates = sband->n_bitrates; } return kzalloc(sizeof(*mi), gfp); } static void minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) { kfree(priv_sta); } static void minstrel_ht_fill_rate_array(u8 *dest, struct ieee80211_supported_band *sband, const s16 *bitrates, int n_rates, u32 rate_flags) { int i, j; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *rate = &sband->bitrates[i]; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; for (j = 0; j < n_rates; j++) { if (rate->bitrate != bitrates[j]) continue; dest[j] = i; break; } } } static void minstrel_ht_init_cck_rates(struct minstrel_priv *mp) { static const s16 bitrates[4] = { 10, 20, 55, 110 }; struct ieee80211_supported_band *sband; u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->cck_rates, 0xff, sizeof(mp->cck_rates)); sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; if (!sband) return; BUILD_BUG_ON(ARRAY_SIZE(mp->cck_rates) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->cck_rates, sband, minstrel_cck_bitrates, ARRAY_SIZE(minstrel_cck_bitrates), rate_flags); } static void minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) { static const s16 bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; struct ieee80211_supported_band *sband; u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->ofdm_rates[band], 0xff, sizeof(mp->ofdm_rates[band])); sband = mp->hw->wiphy->bands[band]; if (!sband) return; BUILD_BUG_ON(ARRAY_SIZE(mp->ofdm_rates[band]) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->ofdm_rates[band], sband, minstrel_ofdm_bitrates, ARRAY_SIZE(minstrel_ofdm_bitrates), rate_flags); } static void * minstrel_ht_alloc(struct ieee80211_hw *hw) { struct minstrel_priv *mp; int i; mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC); if (!mp) return NULL; /* contention window settings * Just an approximation. Using the per-queue values would complicate * the calculations and is probably unnecessary */ mp->cw_min = 15; mp->cw_max = 1023; /* maximum time that the hw is allowed to stay in one MRR segment */ mp->segment_size = 6000; if (hw->max_rate_tries > 0) mp->max_retry = hw->max_rate_tries; else /* safe default, does not necessarily have to match hw properties */ mp->max_retry = 7; mp->hw = hw; mp->update_interval = HZ / 20; minstrel_ht_init_cck_rates(mp); for (i = 0; i < ARRAY_SIZE(mp->hw->wiphy->bands); i++) minstrel_ht_init_ofdm_rates(mp, i); return mp; } #ifdef CONFIG_MAC80211_DEBUGFS static void minstrel_ht_add_debugfs(struct ieee80211_hw *hw, void *priv, struct dentry *debugfsdir) { struct minstrel_priv *mp = priv; mp->fixed_rate_idx = (u32) -1; debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx); } #endif static void minstrel_ht_free(void *priv) { kfree(priv); } static u32 minstrel_ht_get_expected_throughput(void *priv_sta) { struct minstrel_ht_sta *mi = priv_sta; int i, j, prob, tp_avg; i = MI_RATE_GROUP(mi->max_tp_rate[0]); j = MI_RATE_IDX(mi->max_tp_rate[0]); prob = mi->groups[i].rates[j].prob_avg; /* convert tp_avg from pkt per second in kbps */ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024; return tp_avg; } static const struct rate_control_ops mac80211_minstrel_ht = { .name = "minstrel_ht", .capa = RATE_CTRL_CAPA_AMPDU_TRIGGER, .tx_status_ext = minstrel_ht_tx_status, .get_rate = minstrel_ht_get_rate, .rate_init = minstrel_ht_rate_init, .rate_update = minstrel_ht_rate_update, .alloc_sta = minstrel_ht_alloc_sta, .free_sta = minstrel_ht_free_sta, .alloc = minstrel_ht_alloc, .free = minstrel_ht_free, #ifdef CONFIG_MAC80211_DEBUGFS .add_debugfs = minstrel_ht_add_debugfs, .add_sta_debugfs = minstrel_ht_add_sta_debugfs, #endif .get_expected_throughput = minstrel_ht_get_expected_throughput, }; static void __init init_sample_table(void) { int col, i, new_idx; u8 rnd[MCS_GROUP_RATES]; memset(sample_table, 0xff, sizeof(sample_table)); for (col = 0; col < SAMPLE_COLUMNS; col++) { get_random_bytes(rnd, sizeof(rnd)); for (i = 0; i < MCS_GROUP_RATES; i++) { new_idx = (i + rnd[i]) % MCS_GROUP_RATES; while (sample_table[col][new_idx] != 0xff) new_idx = (new_idx + 1) % MCS_GROUP_RATES; sample_table[col][new_idx] = i; } } } int __init rc80211_minstrel_init(void) { init_sample_table(); return ieee80211_rate_control_register(&mac80211_minstrel_ht); } void rc80211_minstrel_exit(void) { ieee80211_rate_control_unregister(&mac80211_minstrel_ht); } |
31 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/netfilter.h> #include <net/tcp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_seqadj.h> int nf_ct_seqadj_init(struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conn_seqadj *seqadj; struct nf_ct_seqadj *this_way; if (off == 0) return 0; set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); seqadj = nfct_seqadj(ct); this_way = &seqadj->seq[dir]; this_way->offset_before = off; this_way->offset_after = off; return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_init); int nf_ct_seqadj_set(struct nf_conn *ct, enum ip_conntrack_info ctinfo, __be32 seq, s32 off) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_ct_seqadj *this_way; if (off == 0) return 0; if (unlikely(!seqadj)) { WARN_ONCE(1, "Missing nfct_seqadj_ext_add() setup call\n"); return 0; } set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); spin_lock_bh(&ct->lock); this_way = &seqadj->seq[dir]; if (this_way->offset_before == this_way->offset_after || before(this_way->correction_pos, ntohl(seq))) { this_way->correction_pos = ntohl(seq); this_way->offset_before = this_way->offset_after; this_way->offset_after += off; } spin_unlock_bh(&ct->lock); return 0; } EXPORT_SYMBOL_GPL(nf_ct_seqadj_set); void nf_ct_tcp_seqadj_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off) { const struct tcphdr *th; if (nf_ct_protonum(ct) != IPPROTO_TCP) return; th = (struct tcphdr *)(skb_network_header(skb) + ip_hdrlen(skb)); nf_ct_seqadj_set(ct, ctinfo, th->seq, off); } EXPORT_SYMBOL_GPL(nf_ct_tcp_seqadj_set); /* Adjust one found SACK option including checksum correction */ static void nf_ct_sack_block_adjust(struct sk_buff *skb, struct tcphdr *tcph, unsigned int sackoff, unsigned int sackend, struct nf_ct_seqadj *seq) { while (sackoff < sackend) { struct tcp_sack_block_wire *sack; __be32 new_start_seq, new_end_seq; sack = (void *)skb->data + sackoff; if (after(ntohl(sack->start_seq) - seq->offset_before, seq->correction_pos)) new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_after); else new_start_seq = htonl(ntohl(sack->start_seq) - seq->offset_before); if (after(ntohl(sack->end_seq) - seq->offset_before, seq->correction_pos)) new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_after); else new_end_seq = htonl(ntohl(sack->end_seq) - seq->offset_before); pr_debug("sack_adjust: start_seq: %u->%u, end_seq: %u->%u\n", ntohl(sack->start_seq), ntohl(new_start_seq), ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); } } /* TCP SACK sequence number adjustment */ static unsigned int nf_ct_sack_adjust(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { struct tcphdr *tcph = (void *)skb->data + protoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); unsigned int dir, optoff, optend; optoff = protoff + sizeof(struct tcphdr); optend = protoff + tcph->doff * 4; if (skb_ensure_writable(skb, optend)) return 0; tcph = (void *)skb->data + protoff; dir = CTINFO2DIR(ctinfo); while (optoff < optend) { /* Usually: option, length. */ unsigned char *op = skb->data + optoff; switch (op[0]) { case TCPOPT_EOL: return 1; case TCPOPT_NOP: optoff++; continue; default: /* no partial options */ if (optoff + 1 == optend || optoff + op[1] > optend || op[1] < 2) return 0; if (op[0] == TCPOPT_SACK && op[1] >= 2+TCPOLEN_SACK_PERBLOCK && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) nf_ct_sack_block_adjust(skb, tcph, optoff + 2, optoff+op[1], &seqadj->seq[!dir]); optoff += op[1]; } } return 1; } /* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct tcphdr *tcph; __be32 newseq, newack; s32 seqoff, ackoff; struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way, *other_way; int res = 1; this_way = &seqadj->seq[dir]; other_way = &seqadj->seq[!dir]; if (skb_ensure_writable(skb, protoff + sizeof(*tcph))) return 0; tcph = (void *)skb->data + protoff; spin_lock_bh(&ct->lock); if (after(ntohl(tcph->seq), this_way->correction_pos)) seqoff = this_way->offset_after; else seqoff = this_way->offset_before; newseq = htonl(ntohl(tcph->seq) + seqoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); pr_debug("Adjusting sequence number from %u->%u\n", ntohl(tcph->seq), ntohl(newseq)); tcph->seq = newseq; if (!tcph->ack) goto out; if (after(ntohl(tcph->ack_seq) - other_way->offset_before, other_way->correction_pos)) ackoff = other_way->offset_after; else ackoff = other_way->offset_before; newack = htonl(ntohl(tcph->ack_seq) - ackoff); inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, false); pr_debug("Adjusting ack number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), ntohl(newack)); tcph->ack_seq = newack; res = nf_ct_sack_adjust(skb, protoff, ct, ctinfo); out: spin_unlock_bh(&ct->lock); return res; } EXPORT_SYMBOL_GPL(nf_ct_seq_adjust); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *this_way; if (!seqadj) return 0; this_way = &seqadj->seq[dir]; return after(seq, this_way->correction_pos) ? this_way->offset_after : this_way->offset_before; } EXPORT_SYMBOL_GPL(nf_ct_seq_offset); |
4 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_NFLOG.h> #include <net/netfilter/nf_log.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: packet logging to netlink using NFLOG"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_NFLOG"); MODULE_ALIAS("ip6t_NFLOG"); static unsigned int nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_nflog_info *info = par->targinfo; struct net *net = xt_net(par); struct nf_loginfo li; li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; li.u.ulog.flags = 0; if (info->flags & XT_NFLOG_F_COPY_LEN) li.u.ulog.flags |= NF_LOG_F_COPY_LEN; nf_log_packet(net, xt_family(par), xt_hooknum(par), skb, xt_in(par), xt_out(par), &li, "%s", info->prefix); return XT_CONTINUE; } static int nflog_tg_check(const struct xt_tgchk_param *par) { const struct xt_nflog_info *info = par->targinfo; int ret; if (info->flags & ~XT_NFLOG_MASK) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); if (ret != 0 && !par->nft_compat) { request_module("%s", "nfnetlink_log"); ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); } return ret; } static void nflog_tg_destroy(const struct xt_tgdtor_param *par) { nf_logger_put(par->family, NF_LOG_TYPE_ULOG); } static struct xt_target nflog_tg_reg[] __read_mostly = { { .name = "NFLOG", .revision = 0, .family = NFPROTO_IPV4, .checkentry = nflog_tg_check, .destroy = nflog_tg_destroy, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "NFLOG", .revision = 0, .family = NFPROTO_IPV6, .checkentry = nflog_tg_check, .destroy = nflog_tg_destroy, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), .me = THIS_MODULE, }, #endif }; static int __init nflog_tg_init(void) { return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } static void __exit nflog_tg_exit(void) { xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); } module_init(nflog_tg_init); module_exit(nflog_tg_exit); MODULE_SOFTDEP("pre: nfnetlink_log"); |
2 1 2 10 10 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/errno.h> #include <linux/list.h> #include <linux/moduleparam.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/skbuff.h> #include <linux/stddef.h> #include <linux/string.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "netlink.h" char batadv_routing_algo[20] = "BATMAN_IV"; static struct hlist_head batadv_algo_list; /** * batadv_algo_init() - Initialize batman-adv algorithm management data * structures */ void batadv_algo_init(void) { INIT_HLIST_HEAD(&batadv_algo_list); } /** * batadv_algo_get() - Search for algorithm with specific name * @name: algorithm name to find * * Return: Pointer to batadv_algo_ops on success, NULL otherwise */ struct batadv_algo_ops *batadv_algo_get(const char *name) { struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp; hlist_for_each_entry(bat_algo_ops_tmp, &batadv_algo_list, list) { if (strcmp(bat_algo_ops_tmp->name, name) != 0) continue; bat_algo_ops = bat_algo_ops_tmp; break; } return bat_algo_ops; } /** * batadv_algo_register() - Register callbacks for a mesh algorithm * @bat_algo_ops: mesh algorithm callbacks to add * * Return: 0 on success or negative error number in case of failure */ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) { struct batadv_algo_ops *bat_algo_ops_tmp; bat_algo_ops_tmp = batadv_algo_get(bat_algo_ops->name); if (bat_algo_ops_tmp) { pr_info("Trying to register already registered routing algorithm: %s\n", bat_algo_ops->name); return -EEXIST; } /* all algorithms must implement all ops (for now) */ if (!bat_algo_ops->iface.enable || !bat_algo_ops->iface.disable || !bat_algo_ops->iface.update_mac || !bat_algo_ops->iface.primary_set || !bat_algo_ops->neigh.cmp || !bat_algo_ops->neigh.is_similar_or_better) { pr_info("Routing algo '%s' does not implement required ops\n", bat_algo_ops->name); return -EINVAL; } INIT_HLIST_NODE(&bat_algo_ops->list); hlist_add_head(&bat_algo_ops->list, &batadv_algo_list); return 0; } /** * batadv_algo_select() - Select algorithm of soft interface * @bat_priv: the bat priv with all the soft interface information * @name: name of the algorithm to select * * The algorithm callbacks for the soft interface will be set when the algorithm * with the correct name was found. Any previous selected algorithm will not be * deinitialized and the new selected algorithm will also not be initialized. * It is therefore not allowed to call batadv_algo_select outside the creation * function of the soft interface. * * Return: 0 on success or negative error number in case of failure */ int batadv_algo_select(struct batadv_priv *bat_priv, const char *name) { struct batadv_algo_ops *bat_algo_ops; bat_algo_ops = batadv_algo_get(name); if (!bat_algo_ops) return -EINVAL; bat_priv->algo_ops = bat_algo_ops; return 0; } static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) { struct batadv_algo_ops *bat_algo_ops; char *algo_name = (char *)val; size_t name_len = strlen(algo_name); if (name_len > 0 && algo_name[name_len - 1] == '\n') algo_name[name_len - 1] = '\0'; bat_algo_ops = batadv_algo_get(algo_name); if (!bat_algo_ops) { pr_err("Routing algorithm '%s' is not supported\n", algo_name); return -EINVAL; } return param_set_copystring(algo_name, kp); } static const struct kernel_param_ops batadv_param_ops_ra = { .set = batadv_param_set_ra, .get = param_get_string, }; static struct kparam_string batadv_param_string_ra = { .maxlen = sizeof(batadv_routing_algo), .string = batadv_routing_algo, }; module_param_cb(routing_algo, &batadv_param_ops_ra, &batadv_param_string_ra, 0644); /** * batadv_algo_dump_entry() - fill in information about one supported routing * algorithm * @msg: netlink message to be sent back * @portid: Port to reply to * @seq: Sequence number of message * @bat_algo_ops: Algorithm to be dumped * * Return: Error number, or 0 on success */ static int batadv_algo_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_algo_ops *bat_algo_ops) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ROUTING_ALGOS); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_algo_ops->name)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_algo_dump() - fill in information about supported routing * algorithms * @msg: netlink message to be sent back * @cb: Parameters to the netlink request * * Return: Length of reply message. */ int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_algo_ops *bat_algo_ops; int skip = cb->args[0]; int i = 0; hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { if (i++ < skip) continue; if (batadv_algo_dump_entry(msg, portid, cb->nlh->nlmsg_seq, bat_algo_ops)) { i--; break; } } cb->args[0] = i; return msg->len; } |
522 494 12 482 485 480 199 3 193 196 579 46 579 579 1 577 582 578 580 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/main.c - Where the driver meets power management. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * * The driver model core calls device_pm_add() when a device is registered. * This will initialize the embedded device_pm_info object in the device * and add it to the list of power-controlled devices. sysfs entries for * controlling device power management will also be added. * * A separate list is used for keeping track of power info, because the power * domain dependencies may differ from the ancestral dependencies that the * subsystem list maintains. */ #define pr_fmt(fmt) "PM: " fmt #define dev_fmt pr_fmt #include <linux/device.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/pm_runtime.h> #include <linux/pm-trace.h> #include <linux/pm_wakeirq.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/async.h> #include <linux/suspend.h> #include <trace/events/power.h> #include <linux/cpufreq.h> #include <linux/devfreq.h> #include <linux/timer.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); #define list_for_each_entry_rcu_locked(pos, head, member) \ list_for_each_entry_rcu(pos, head, member, \ device_links_read_lock_held()) /* * The entries in the dpm_list list are in a depth first order, simply * because children are guaranteed to be discovered after parents, and * are inserted at the back of the list on discovery. * * Since device_pm_add() may be called with a device lock held, * we must never try to acquire a device lock while holding * dpm_list_mutex. */ LIST_HEAD(dpm_list); static LIST_HEAD(dpm_prepared_list); static LIST_HEAD(dpm_suspended_list); static LIST_HEAD(dpm_late_early_list); static LIST_HEAD(dpm_noirq_list); static DEFINE_MUTEX(dpm_list_mtx); static pm_message_t pm_transition; static int async_error; static const char *pm_verb(int event) { switch (event) { case PM_EVENT_SUSPEND: return "suspend"; case PM_EVENT_RESUME: return "resume"; case PM_EVENT_FREEZE: return "freeze"; case PM_EVENT_QUIESCE: return "quiesce"; case PM_EVENT_HIBERNATE: return "hibernate"; case PM_EVENT_THAW: return "thaw"; case PM_EVENT_RESTORE: return "restore"; case PM_EVENT_RECOVER: return "recover"; default: return "(unknown PM event)"; } } /** * device_pm_sleep_init - Initialize system suspend-related device fields. * @dev: Device object being initialized. */ void device_pm_sleep_init(struct device *dev) { dev->power.is_prepared = false; dev->power.is_suspended = false; dev->power.is_noirq_suspended = false; dev->power.is_late_suspended = false; init_completion(&dev->power.completion); complete_all(&dev->power.completion); dev->power.wakeup = NULL; INIT_LIST_HEAD(&dev->power.entry); } /** * device_pm_lock - Lock the list of active devices used by the PM core. */ void device_pm_lock(void) { mutex_lock(&dpm_list_mtx); } /** * device_pm_unlock - Unlock the list of active devices used by the PM core. */ void device_pm_unlock(void) { mutex_unlock(&dpm_list_mtx); } /** * device_pm_add - Add a device to the PM core's list of active devices. * @dev: Device to add to the list. */ void device_pm_add(struct device *dev) { /* Skip PM setup/initialization. */ if (device_pm_not_required(dev)) return; pr_debug("Adding info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); device_pm_check_callbacks(dev); mutex_lock(&dpm_list_mtx); if (dev->parent && dev->parent->power.is_prepared) dev_warn(dev, "parent %s should not be sleeping\n", dev_name(dev->parent)); list_add_tail(&dev->power.entry, &dpm_list); dev->power.in_dpm_list = true; mutex_unlock(&dpm_list_mtx); } /** * device_pm_remove - Remove a device from the PM core's list of active devices. * @dev: Device to be removed from the list. */ void device_pm_remove(struct device *dev) { if (device_pm_not_required(dev)) return; pr_debug("Removing info for %s:%s\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); complete_all(&dev->power.completion); mutex_lock(&dpm_list_mtx); list_del_init(&dev->power.entry); dev->power.in_dpm_list = false; mutex_unlock(&dpm_list_mtx); device_wakeup_disable(dev); pm_runtime_remove(dev); device_pm_check_callbacks(dev); } /** * device_pm_move_before - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come before. */ void device_pm_move_before(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s before %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert before devb. */ list_move_tail(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_after - Move device in the PM core's list of active devices. * @deva: Device to move in dpm_list. * @devb: Device @deva should come after. */ void device_pm_move_after(struct device *deva, struct device *devb) { pr_debug("Moving %s:%s after %s:%s\n", deva->bus ? deva->bus->name : "No Bus", dev_name(deva), devb->bus ? devb->bus->name : "No Bus", dev_name(devb)); /* Delete deva from dpm_list and reinsert after devb. */ list_move(&deva->power.entry, &devb->power.entry); } /** * device_pm_move_last - Move device to end of the PM core's list of devices. * @dev: Device to move in dpm_list. */ void device_pm_move_last(struct device *dev) { pr_debug("Moving %s:%s to end of list\n", dev->bus ? dev->bus->name : "No Bus", dev_name(dev)); list_move_tail(&dev->power.entry, &dpm_list); } static ktime_t initcall_debug_start(struct device *dev, void *cb) { if (!pm_print_times_enabled) return 0; dev_info(dev, "calling %ps @ %i, parent: %s\n", cb, task_pid_nr(current), dev->parent ? dev_name(dev->parent) : "none"); return ktime_get(); } static void initcall_debug_report(struct device *dev, ktime_t calltime, void *cb, int error) { ktime_t rettime; if (!pm_print_times_enabled) return; rettime = ktime_get(); dev_info(dev, "%ps returned %d after %Ld usecs\n", cb, error, (unsigned long long)ktime_us_delta(rettime, calltime)); } /** * dpm_wait - Wait for a PM operation to complete. * @dev: Device to wait for. * @async: If unset, wait only if the device's power.async_suspend flag is set. */ static void dpm_wait(struct device *dev, bool async) { if (!dev) return; if (async || (pm_async_enabled && dev->power.async_suspend)) wait_for_completion(&dev->power.completion); } static int dpm_wait_fn(struct device *dev, void *async_ptr) { dpm_wait(dev, *((bool *)async_ptr)); return 0; } static void dpm_wait_for_children(struct device *dev, bool async) { device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * If the supplier goes away right after we've checked the link to it, * we'll wait for its completion to change the state, but that's fine, * because the only things that will block as a result are the SRCU * callbacks freeing the link objects for the links in the list we're * walking. */ list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->supplier, async); device_links_read_unlock(idx); } static bool dpm_wait_for_superior(struct device *dev, bool async) { struct device *parent; /* * If the device is resumed asynchronously and the parent's callback * deletes both the device and the parent itself, the parent object may * be freed while this function is running, so avoid that by reference * counting the parent once more unless the device has been deleted * already (in which case return right away). */ mutex_lock(&dpm_list_mtx); if (!device_pm_initialized(dev)) { mutex_unlock(&dpm_list_mtx); return false; } parent = get_device(dev->parent); mutex_unlock(&dpm_list_mtx); dpm_wait(parent, async); put_device(parent); dpm_wait_for_suppliers(dev, async); /* * If the parent's callback has deleted the device, attempting to resume * it would be invalid, so avoid doing that then. */ return device_pm_initialized(dev); } static void dpm_wait_for_consumers(struct device *dev, bool async) { struct device_link *link; int idx; idx = device_links_read_lock(); /* * The status of a device link can only be changed from "dormant" by a * probe, but that cannot happen during system suspend/resume. In * theory it can change to "dormant" at that time, but then it is * reasonable to wait for the target device anyway (eg. if it goes * away, it's better to wait for it to go away completely and then * continue instead of trying to continue in parallel with its * unregistration). */ list_for_each_entry_rcu_locked(link, &dev->links.consumers, s_node) if (READ_ONCE(link->status) != DL_STATE_DORMANT) dpm_wait(link->consumer, async); device_links_read_unlock(idx); } static void dpm_wait_for_subordinate(struct device *dev, bool async) { dpm_wait_for_children(dev, async); dpm_wait_for_consumers(dev, async); } /** * pm_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. */ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend; case PM_EVENT_RESUME: return ops->resume; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze; case PM_EVENT_HIBERNATE: return ops->poweroff; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw; case PM_EVENT_RESTORE: return ops->restore; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_late_early_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * Runtime PM is disabled for @dev while this function is being executed. */ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_late; case PM_EVENT_RESUME: return ops->resume_early; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_late; case PM_EVENT_HIBERNATE: return ops->poweroff_late; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_early; case PM_EVENT_RESTORE: return ops->restore_early; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. * @state: PM transition of the system being carried out. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t state) { switch (state.event) { #ifdef CONFIG_SUSPEND case PM_EVENT_SUSPEND: return ops->suspend_noirq; case PM_EVENT_RESUME: return ops->resume_noirq; #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_HIBERNATE_CALLBACKS case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return ops->freeze_noirq; case PM_EVENT_HIBERNATE: return ops->poweroff_noirq; case PM_EVENT_THAW: case PM_EVENT_RECOVER: return ops->thaw_noirq; case PM_EVENT_RESTORE: return ops->restore_noirq; #endif /* CONFIG_HIBERNATE_CALLBACKS */ } return NULL; } static void pm_dev_dbg(struct device *dev, pm_message_t state, const char *info) { dev_dbg(dev, "%s%s%s driver flags: %x\n", info, pm_verb(state.event), ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? ", may wakeup" : "", dev->power.driver_flags); } static void pm_dev_err(struct device *dev, pm_message_t state, const char *info, int error) { dev_err(dev, "failed to %s%s: error %d\n", pm_verb(state.event), info, error); } static void dpm_show_time(ktime_t starttime, pm_message_t state, int error, const char *info) { ktime_t calltime; u64 usecs64; int usecs; calltime = ktime_get(); usecs64 = ktime_to_ns(ktime_sub(calltime, starttime)); do_div(usecs64, NSEC_PER_USEC); usecs = usecs64; if (usecs == 0) usecs = 1; pm_pr_dbg("%s%s%s of devices %s after %ld.%03ld msecs\n", info ?: "", info ? " " : "", pm_verb(state.event), error ? "aborted" : "complete", usecs / USEC_PER_MSEC, usecs % USEC_PER_MSEC); } static int dpm_run_callback(pm_callback_t cb, struct device *dev, pm_message_t state, const char *info) { ktime_t calltime; int error; if (!cb) return 0; calltime = initcall_debug_start(dev, cb); pm_dev_dbg(dev, state, info); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } #ifdef CONFIG_DPM_WATCHDOG struct dpm_watchdog { struct device *dev; struct task_struct *tsk; struct timer_list timer; }; #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) \ struct dpm_watchdog wd /** * dpm_watchdog_handler - Driver suspend / resume watchdog handler. * @t: The timer that PM watchdog depends on. * * Called when a driver has timed out suspending or resuming. * There's not much we can do here to recover so panic() to * capture a crash-dump in pstore. */ static void dpm_watchdog_handler(struct timer_list *t) { struct dpm_watchdog *wd = from_timer(wd, t, timer); dev_emerg(wd->dev, "**** DPM device timeout ****\n"); show_stack(wd->tsk, NULL, KERN_EMERG); panic("%s %s: unrecoverable failure\n", dev_driver_string(wd->dev), dev_name(wd->dev)); } /** * dpm_watchdog_set - Enable pm watchdog for given device. * @wd: Watchdog. Must be allocated on the stack. * @dev: Device to handle. */ static void dpm_watchdog_set(struct dpm_watchdog *wd, struct device *dev) { struct timer_list *timer = &wd->timer; wd->dev = dev; wd->tsk = current; timer_setup_on_stack(timer, dpm_watchdog_handler, 0); /* use same timeout value for both suspend and resume */ timer->expires = jiffies + HZ * CONFIG_DPM_WATCHDOG_TIMEOUT; add_timer(timer); } /** * dpm_watchdog_clear - Disable suspend/resume watchdog. * @wd: Watchdog to disable. */ static void dpm_watchdog_clear(struct dpm_watchdog *wd) { struct timer_list *timer = &wd->timer; del_timer_sync(timer); destroy_timer_on_stack(timer); } #else #define DECLARE_DPM_WATCHDOG_ON_STACK(wd) #define dpm_watchdog_set(x, y) #define dpm_watchdog_clear(x) #endif /*------------------------- Resume routines -------------------------*/ /** * dev_pm_skip_resume - System-wide device resume optimization check. * @dev: Target device. * * Return: * - %false if the transition under way is RESTORE. * - Return value of dev_pm_skip_suspend() if the transition under way is THAW. * - The logical negation of %power.must_resume otherwise (that is, when the * transition under way is RESUME). */ bool dev_pm_skip_resume(struct device *dev) { if (pm_transition.event == PM_EVENT_RESTORE) return false; if (pm_transition.event == PM_EVENT_THAW) return dev_pm_skip_suspend(dev); return !dev->power.must_resume; } static bool is_async(struct device *dev) { return dev->power.async_suspend && pm_async_enabled && !pm_trace_is_enabled(); } static bool dpm_async_fn(struct device *dev, async_func_t func) { reinit_completion(&dev->power.completion); if (is_async(dev)) { dev->power.async_in_progress = true; get_device(dev); if (async_schedule_dev_nocall(func, dev)) return true; put_device(dev); } /* * Because async_schedule_dev_nocall() above has returned false or it * has not been called at all, func() is not running and it is safe to * update the async_in_progress flag without extra synchronization. */ dev->power.async_in_progress = false; return false; } /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static void device_resume_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; bool skip_resume; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_noirq_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; skip_resume = dev_pm_skip_resume(dev); /* * If the driver callback is skipped below or by the middle layer * callback and device_resume_early() also skips the driver callback for * this device later, it needs to appear as "suspended" to PM-runtime, * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime * status to "active", but do that only if DPM_FLAG_SMART_SUSPEND is set * to avoid confusing drivers that don't use it. */ if (skip_resume) pm_runtime_set_suspended(dev); else if (dev_pm_skip_suspend(dev)) pm_runtime_set_active(dev); if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (skip_resume) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_noirq_suspended = false; Out: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } } static void async_resume_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_noirq(dev, pm_transition, true); put_device(dev); } static void dpm_noirq_resume_devices(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_noirq_list, power.entry) dpm_async_fn(dev, async_resume_noirq); while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); if (async_error) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); } /** * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * * Invoke the "noirq" resume callbacks for all devices in dpm_noirq_list and * allow device drivers' interrupt handlers to be called. */ void dpm_resume_noirq(pm_message_t state) { dpm_noirq_resume_devices(state); resume_device_irqs(); device_wakeup_disarm_wake_irqs(); } /** * device_resume_early - Execute an "early resume" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static void device_resume_early(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore || dev->power.direct_complete) goto Out; if (!dev->power.is_late_suspended) goto Out; if (!dpm_wait_for_superior(dev, async)) goto Out; if (dev->pm_domain) { info = "early power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "early type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "early class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "early bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_resume(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "early driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); Skip: dev->power.is_late_suspended = false; Out: TRACE_RESUME(error); pm_runtime_enable(dev); complete_all(&dev->power.completion); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } } static void async_resume_early(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume_early(dev, pm_transition, true); put_device(dev); } /** * dpm_resume_early - Execute "early resume" callbacks for all devices. * @state: PM transition of the system being carried out. */ void dpm_resume_early(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume_early"), state.event, true); async_error = 0; pm_transition = state; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_late_early_list, power.entry) dpm_async_fn(dev, async_resume_early); while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume_early(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); if (async_error) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); } /** * dpm_resume_start - Execute "noirq" and "early" device callbacks. * @state: PM transition of the system being carried out. */ void dpm_resume_start(pm_message_t state) { dpm_resume_noirq(state); dpm_resume_early(state); } EXPORT_SYMBOL_GPL(dpm_resume_start); /** * device_resume - Execute "resume" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being resumed asynchronously. */ static void device_resume(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_RESUME(0); if (dev->power.syscore) goto Complete; if (dev->power.direct_complete) { /* Match the pm_runtime_disable() in __device_suspend(). */ pm_runtime_enable(dev); goto Complete; } if (!dpm_wait_for_superior(dev, async)) goto Complete; dpm_watchdog_set(&wd, dev); device_lock(dev); /* * This is a fib. But we'll allow new children to be added below * a resumed device, even if the device hasn't been completed yet. */ dev->power.is_prepared = false; if (!dev->power.is_suspended) goto Unlock; if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Driver; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Driver; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Driver; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->resume) { info = "legacy bus "; callback = dev->bus->resume; goto End; } } Driver: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } End: error = dpm_run_callback(callback, dev, state, info); dev->power.is_suspended = false; Unlock: device_unlock(dev); dpm_watchdog_clear(&wd); Complete: complete_all(&dev->power.completion); TRACE_RESUME(error); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } } static void async_resume(void *data, async_cookie_t cookie) { struct device *dev = data; device_resume(dev, pm_transition, true); put_device(dev); } /** * dpm_resume - Execute "resume" callbacks for non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the appropriate "resume" callback for all devices whose status * indicates that they are suspended. */ void dpm_resume(pm_message_t state) { struct device *dev; ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); might_sleep(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); /* * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ list_for_each_entry(dev, &dpm_suspended_list, power.entry) dpm_async_fn(dev, async_resume); while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); if (!dev->power.async_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); device_resume(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); } } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); if (async_error) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); devfreq_resume(); trace_suspend_resume(TPS("dpm_resume"), state.event, false); } /** * device_complete - Complete a PM transition for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. */ static void device_complete(struct device *dev, pm_message_t state) { void (*callback)(struct device *) = NULL; const char *info = NULL; if (dev->power.syscore) goto out; device_lock(dev); if (dev->pm_domain) { info = "completing power domain "; callback = dev->pm_domain->ops.complete; } else if (dev->type && dev->type->pm) { info = "completing type "; callback = dev->type->pm->complete; } else if (dev->class && dev->class->pm) { info = "completing class "; callback = dev->class->pm->complete; } else if (dev->bus && dev->bus->pm) { info = "completing bus "; callback = dev->bus->pm->complete; } if (!callback && dev->driver && dev->driver->pm) { info = "completing driver "; callback = dev->driver->pm->complete; } if (callback) { pm_dev_dbg(dev, state, info); callback(dev); } device_unlock(dev); out: pm_runtime_put(dev); } /** * dpm_complete - Complete a PM transition for all non-sysdev devices. * @state: PM transition of the system being carried out. * * Execute the ->complete() callbacks for all devices whose PM status is not * DPM_ON (this allows new devices to be registered). */ void dpm_complete(pm_message_t state) { struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); might_sleep(); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); get_device(dev); dev->power.is_prepared = false; list_move(&dev->power.entry, &list); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); device_complete(dev, state); trace_device_pm_callback_end(dev, 0); put_device(dev); mutex_lock(&dpm_list_mtx); } list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); /* Allow device probing and trigger re-probing of deferred devices */ device_unblock_probing(); trace_suspend_resume(TPS("dpm_complete"), state.event, false); } /** * dpm_resume_end - Execute "resume" callbacks and complete system transition. * @state: PM transition of the system being carried out. * * Execute "resume" callbacks for all devices and complete the PM transition of * the system. */ void dpm_resume_end(pm_message_t state) { dpm_resume(state); dpm_complete(state); } EXPORT_SYMBOL_GPL(dpm_resume_end); /*------------------------- Suspend routines -------------------------*/ /** * resume_event - Return a "resume" message for given "suspend" sleep state. * @sleep_state: PM message representing a sleep state. * * Return a PM message representing the resume event corresponding to given * sleep state. */ static pm_message_t resume_event(pm_message_t sleep_state) { switch (sleep_state.event) { case PM_EVENT_SUSPEND: return PMSG_RESUME; case PM_EVENT_FREEZE: case PM_EVENT_QUIESCE: return PMSG_RECOVER; case PM_EVENT_HIBERNATE: return PMSG_RESTORE; } return PMSG_ON; } static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; if (dev->parent) dev->parent->power.must_resume = true; idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) link->supplier->power.must_resume = true; device_links_read_unlock(idx); } /** * device_suspend_noirq - Execute a "noirq suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * The driver of @dev will not receive interrupts while this function is being * executed. */ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (async_error) goto Complete; if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); goto Complete; } Skip: dev->power.is_noirq_suspended = true; /* * Skipping the resume of devices that were in use right before the * system suspend (as indicated by their PM-runtime usage counters) * would be suboptimal. Also resume them if doing that is not allowed * to be skipped. */ if (atomic_read(&dev->power.usage_count) > 1 || !(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && dev->power.may_skip_resume)) dev->power.must_resume = true; if (dev->power.must_resume) dpm_superior_set_must_resume(dev); Complete: complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } static void async_suspend_noirq(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_noirq(dev, pm_transition, true); put_device(dev); } static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_late_early_list)) { struct device *dev = to_device(dpm_late_early_list.prev); list_move(&dev->power.entry, &dpm_noirq_list); if (dpm_async_fn(dev, async_suspend_noirq)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_show_time(starttime, state, error, "noirq"); trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, false); return error; } /** * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers' interrupt handlers from being called and invoke * "noirq" suspend callbacks for all non-sysdev devices. */ int dpm_suspend_noirq(pm_message_t state) { int ret; device_wakeup_arm_wake_irqs(); suspend_device_irqs(); ret = dpm_noirq_suspend_devices(state); if (ret) dpm_resume_noirq(resume_event(state)); return ret; } static void dpm_propagate_wakeup_to_parent(struct device *dev) { struct device *parent = dev->parent; if (!parent) return; spin_lock_irq(&parent->power.lock); if (device_wakeup_path(dev) && !parent->power.ignore_children) parent->power.wakeup_path = true; spin_unlock_irq(&parent->power.lock); } /** * device_suspend_late - Execute a "late suspend" callback for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. * * Runtime PM is disabled for @dev while this function is being executed. */ static int device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; TRACE_DEVICE(dev); TRACE_SUSPEND(0); __pm_runtime_disable(dev, false); dpm_wait_for_subordinate(dev, async); if (async_error) goto Complete; if (pm_wakeup_pending()) { async_error = -EBUSY; goto Complete; } if (dev->power.syscore || dev->power.direct_complete) goto Complete; if (dev->pm_domain) { info = "late power domain "; callback = pm_late_early_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { info = "late type "; callback = pm_late_early_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { info = "late class "; callback = pm_late_early_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { info = "late bus "; callback = pm_late_early_op(dev->bus->pm, state); } if (callback) goto Run; if (dev_pm_skip_suspend(dev)) goto Skip; if (dev->driver && dev->driver->pm) { info = "late driver "; callback = pm_late_early_op(dev->driver->pm, state); } Run: error = dpm_run_callback(callback, dev, state, info); if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); goto Complete; } dpm_propagate_wakeup_to_parent(dev); Skip: dev->power.is_late_suspended = true; Complete: TRACE_SUSPEND(error); complete_all(&dev->power.completion); return error; } static void async_suspend_late(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend_late(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend_late - Execute "late suspend" callbacks for all devices. * @state: PM transition of the system being carried out. */ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); pm_transition = state; async_error = 0; wake_up_all_idle_cpus(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_suspended_list)) { struct device *dev = to_device(dpm_suspended_list.prev); list_move(&dev->power.entry, &dpm_late_early_list); if (dpm_async_fn(dev, async_suspend_late)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); } dpm_show_time(starttime, state, error, "late"); trace_suspend_resume(TPS("dpm_suspend_late"), state.event, false); return error; } /** * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. * @state: PM transition of the system being carried out. */ int dpm_suspend_end(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_suspend_late(state); if (error) goto out; error = dpm_suspend_noirq(state); if (error) dpm_resume_early(resume_event(state)); out: dpm_show_time(starttime, state, error, "end"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. * @dev: Device to suspend. * @state: PM transition of the system being carried out. * @cb: Suspend callback to execute. * @info: string description of caller. */ static int legacy_suspend(struct device *dev, pm_message_t state, int (*cb)(struct device *dev, pm_message_t state), const char *info) { int error; ktime_t calltime; calltime = initcall_debug_start(dev, cb); trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); return error; } static void dpm_clear_superiors_direct_complete(struct device *dev) { struct device_link *link; int idx; if (dev->parent) { spin_lock_irq(&dev->parent->power.lock); dev->parent->power.direct_complete = false; spin_unlock_irq(&dev->parent->power.lock); } idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { spin_lock_irq(&link->supplier->power.lock); link->supplier->power.direct_complete = false; spin_unlock_irq(&link->supplier->power.lock); } device_links_read_unlock(idx); } /** * device_suspend - Execute "suspend" callbacks for given device. * @dev: Device to handle. * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ static int device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); TRACE_DEVICE(dev); TRACE_SUSPEND(0); dpm_wait_for_subordinate(dev, async); if (async_error) { dev->power.direct_complete = false; goto Complete; } /* * Wait for possible runtime PM transitions of the device in progress * to complete and if there's a runtime resume request pending for it, * resume it before proceeding with invoking the system-wide suspend * callbacks for it. * * If the system-wide suspend callbacks below change the configuration * of the device, they must disable runtime PM for it or otherwise * ensure that its runtime-resume callbacks will not be confused by that * change in case they are invoked going forward. */ pm_runtime_barrier(dev); if (pm_wakeup_pending()) { dev->power.direct_complete = false; async_error = -EBUSY; goto Complete; } if (dev->power.syscore) goto Complete; /* Avoid direct_complete to let wakeup_path propagate. */ if (device_may_wakeup(dev) || device_wakeup_path(dev)) dev->power.direct_complete = false; if (dev->power.direct_complete) { if (pm_runtime_status_suspended(dev)) { pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); goto Complete; } pm_runtime_enable(dev); } dev->power.direct_complete = false; } dev->power.may_skip_resume = true; dev->power.must_resume = !dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME); dpm_watchdog_set(&wd, dev); device_lock(dev); if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); goto Run; } if (dev->type && dev->type->pm) { info = "type "; callback = pm_op(dev->type->pm, state); goto Run; } if (dev->class && dev->class->pm) { info = "class "; callback = pm_op(dev->class->pm, state); goto Run; } if (dev->bus) { if (dev->bus->pm) { info = "bus "; callback = pm_op(dev->bus->pm, state); } else if (dev->bus->suspend) { pm_dev_dbg(dev, state, "legacy bus "); error = legacy_suspend(dev, state, dev->bus->suspend, "legacy bus "); goto End; } } Run: if (!callback && dev->driver && dev->driver->pm) { info = "driver "; callback = pm_op(dev->driver->pm, state); } error = dpm_run_callback(callback, dev, state, info); End: if (!error) { dev->power.is_suspended = true; if (device_may_wakeup(dev)) dev->power.wakeup_path = true; dpm_propagate_wakeup_to_parent(dev); dpm_clear_superiors_direct_complete(dev); } device_unlock(dev); dpm_watchdog_clear(&wd); Complete: if (error) { async_error = error; dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async" : "", error); } complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } static void async_suspend(void *data, async_cookie_t cookie) { struct device *dev = data; device_suspend(dev, pm_transition, true); put_device(dev); } /** * dpm_suspend - Execute "suspend" callbacks for all non-sysdev devices. * @state: PM transition of the system being carried out. */ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); devfreq_suspend(); cpufreq_suspend(); pm_transition = state; async_error = 0; mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_prepared_list)) { struct device *dev = to_device(dpm_prepared_list.prev); list_move(&dev->power.entry, &dpm_suspended_list); if (dpm_async_fn(dev, async_suspend)) continue; get_device(dev); mutex_unlock(&dpm_list_mtx); error = device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); if (error || async_error) break; } mutex_unlock(&dpm_list_mtx); async_synchronize_full(); if (!error) error = async_error; if (error) dpm_save_failed_step(SUSPEND_SUSPEND); dpm_show_time(starttime, state, error, NULL); trace_suspend_resume(TPS("dpm_suspend"), state.event, false); return error; } /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for given device. No new children of the * device may be registered after this function has returned. */ static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; int ret = 0; /* * If a device's parent goes into runtime suspend at the wrong time, * it won't be possible to resume the device. To prevent this we * block runtime suspend here, during the prepare phase, and allow * it again during the complete phase. */ pm_runtime_get_noresume(dev); if (dev->power.syscore) return 0; device_lock(dev); dev->power.wakeup_path = false; if (dev->power.no_pm_callbacks) goto unlock; if (dev->pm_domain) callback = dev->pm_domain->ops.prepare; else if (dev->type && dev->type->pm) callback = dev->type->pm->prepare; else if (dev->class && dev->class->pm) callback = dev->class->pm->prepare; else if (dev->bus && dev->bus->pm) callback = dev->bus->pm->prepare; if (!callback && dev->driver && dev->driver->pm) callback = dev->driver->pm->prepare; if (callback) ret = callback(dev); unlock: device_unlock(dev); if (ret < 0) { suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is * runtime-suspended, you can leave it in that state provided that you * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. */ spin_lock_irq(&dev->power.lock); dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); spin_unlock_irq(&dev->power.lock); return 0; } /** * dpm_prepare - Prepare all non-sysdev devices for a system PM transition. * @state: PM transition of the system being carried out. * * Execute the ->prepare() callback(s) for all devices. */ int dpm_prepare(pm_message_t state) { int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); might_sleep(); /* * Give a chance for the known devices to complete their probes, before * disable probing of devices. This sync point is important at least * at boot time + hibernation restore. */ wait_for_device_probe(); /* * It is unsafe if probing of devices will happen during suspend or * hibernation and system behavior will be unpredictable in this case. * So, let's prohibit device's probing here and defer their probes * instead. The normal behavior will be restored in dpm_complete(). */ device_block_probing(); mutex_lock(&dpm_list_mtx); while (!list_empty(&dpm_list) && !error) { struct device *dev = to_device(dpm_list.next); get_device(dev); mutex_unlock(&dpm_list_mtx); trace_device_pm_callback_start(dev, "", state.event); error = device_prepare(dev, state); trace_device_pm_callback_end(dev, error); mutex_lock(&dpm_list_mtx); if (!error) { dev->power.is_prepared = true; if (!list_empty(&dev->power.entry)) list_move_tail(&dev->power.entry, &dpm_prepared_list); } else if (error == -EAGAIN) { error = 0; } else { dev_info(dev, "not prepared for power transition: code %d\n", error); } mutex_unlock(&dpm_list_mtx); put_device(dev); mutex_lock(&dpm_list_mtx); } mutex_unlock(&dpm_list_mtx); trace_suspend_resume(TPS("dpm_prepare"), state.event, false); return error; } /** * dpm_suspend_start - Prepare devices for PM transition and suspend them. * @state: PM transition of the system being carried out. * * Prepare all non-sysdev devices for system PM transition and execute "suspend" * callbacks for them. */ int dpm_suspend_start(pm_message_t state) { ktime_t starttime = ktime_get(); int error; error = dpm_prepare(state); if (error) dpm_save_failed_step(SUSPEND_PREPARE); else error = dpm_suspend(state); dpm_show_time(starttime, state, error, "start"); return error; } EXPORT_SYMBOL_GPL(dpm_suspend_start); void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) dev_err(dev, "%s(): %ps returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); /** * device_pm_wait_for_dev - Wait for suspend/resume of a device to complete. * @subordinate: Device that needs to wait for @dev. * @dev: Device to wait for. */ int device_pm_wait_for_dev(struct device *subordinate, struct device *dev) { dpm_wait(dev, subordinate->power.async_suspend); return async_error; } EXPORT_SYMBOL_GPL(device_pm_wait_for_dev); /** * dpm_for_each_dev - device iterator. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over devices in dpm_list, and call @fn for each device, * passing it @data. */ void dpm_for_each_dev(void *data, void (*fn)(struct device *, void *)) { struct device *dev; if (!fn) return; device_pm_lock(); list_for_each_entry(dev, &dpm_list, power.entry) fn(dev, data); device_pm_unlock(); } EXPORT_SYMBOL_GPL(dpm_for_each_dev); static bool pm_ops_is_empty(const struct dev_pm_ops *ops) { if (!ops) return true; return !ops->prepare && !ops->suspend && !ops->suspend_late && !ops->suspend_noirq && !ops->resume_noirq && !ops->resume_early && !ops->resume && !ops->complete; } void device_pm_check_callbacks(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); dev->power.no_pm_callbacks = (!dev->bus || (pm_ops_is_empty(dev->bus->pm) && !dev->bus->suspend && !dev->bus->resume)) && (!dev->class || pm_ops_is_empty(dev->class->pm)) && (!dev->type || pm_ops_is_empty(dev->type->pm)) && (!dev->pm_domain || pm_ops_is_empty(&dev->pm_domain->ops)) && (!dev->driver || (pm_ops_is_empty(dev->driver->pm) && !dev->driver->suspend && !dev->driver->resume)); spin_unlock_irqrestore(&dev->power.lock, flags); } bool dev_pm_skip_suspend(struct device *dev) { return dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) && pm_runtime_status_suspended(dev); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLBFLUSH_H #define _ASM_X86_TLBFLUSH_H #include <linux/mm_types.h> #include <linux/mmu_notifier.h> #include <linux/sched.h> #include <asm/processor.h> #include <asm/cpufeature.h> #include <asm/special_insns.h> #include <asm/smp.h> #include <asm/invpcid.h> #include <asm/pti.h> #include <asm/processor-flags.h> #include <asm/pgtable.h> DECLARE_PER_CPU(u64, tlbstate_untag_mask); void __flush_tlb_all(void); #define TLB_FLUSH_ALL -1UL #define TLB_GENERATION_INVALID 0 void cr4_update_irqsoff(unsigned long set, unsigned long clear); unsigned long cr4_read_shadow(void); /* Set in this cpu's CR4. */ static inline void cr4_set_bits_irqsoff(unsigned long mask) { cr4_update_irqsoff(mask, 0); } /* Clear in this cpu's CR4. */ static inline void cr4_clear_bits_irqsoff(unsigned long mask) { cr4_update_irqsoff(0, mask); } /* Set in this cpu's CR4. */ static inline void cr4_set_bits(unsigned long mask) { unsigned long flags; local_irq_save(flags); cr4_set_bits_irqsoff(mask); local_irq_restore(flags); } /* Clear in this cpu's CR4. */ static inline void cr4_clear_bits(unsigned long mask) { unsigned long flags; local_irq_save(flags); cr4_clear_bits_irqsoff(mask); local_irq_restore(flags); } #ifndef MODULE /* * 6 because 6 should be plenty and struct tlb_state will fit in two cache * lines. */ #define TLB_NR_DYN_ASIDS 6 struct tlb_context { u64 ctx_id; u64 tlb_gen; }; struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts * are on. This means that it may not match current->active_mm, * which will contain the previous user mm when we're in lazy TLB * mode even if we've already switched back to swapper_pg_dir. * * During switch_mm_irqs_off(), loaded_mm will be set to * LOADED_MM_SWITCHING during the brief interrupts-off window * when CR3 and loaded_mm would otherwise be inconsistent. This * is for nmi_uaccess_okay()'s benefit. */ struct mm_struct *loaded_mm; #define LOADED_MM_SWITCHING ((struct mm_struct *)1UL) /* Last user mm for optimizing IBPB */ union { struct mm_struct *last_user_mm; unsigned long last_user_mm_spec; }; u16 loaded_mm_asid; u16 next_asid; /* * If set we changed the page tables in such a way that we * needed an invalidation of all contexts (aka. PCIDs / ASIDs). * This tells us to go invalidate all the non-loaded ctxs[] * on the next context switch. * * The current ctx was kept up-to-date as it ran and does not * need to be invalidated. */ bool invalidate_other; #ifdef CONFIG_ADDRESS_MASKING /* * Active LAM mode. * * X86_CR3_LAM_U57/U48 shifted right by X86_CR3_LAM_U57_BIT or 0 if LAM * disabled. */ u8 lam; #endif /* * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate * the corresponding user PCID needs a flush next time we * switch to it; see SWITCH_TO_USER_CR3. */ unsigned short user_pcid_flush_mask; /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ unsigned long cr4; /* * This is a list of all contexts that might exist in the TLB. * There is one per ASID that we use, and the ASID (what the * CPU calls PCID) is the index into ctxts. * * For each context, ctx_id indicates which mm the TLB's user * entries came from. As an invariant, the TLB will never * contain entries that are out-of-date as when that mm reached * the tlb_gen in the list. * * To be clear, this means that it's legal for the TLB code to * flush the TLB without updating tlb_gen. This can happen * (for now, at least) due to paravirt remote flushes. * * NB: context 0 is a bit special, since it's also used by * various bits of init code. This is fine -- code that * isn't aware of PCID will end up harmlessly flushing * context 0. */ struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; }; DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate); struct tlb_state_shared { /* * We can be in one of several states: * * - Actively using an mm. Our CPU's bit will be set in * mm_cpumask(loaded_mm) and is_lazy == false; * * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit * will not be set in mm_cpumask(&init_mm) and is_lazy == false. * * - Lazily using a real mm. loaded_mm != &init_mm, our bit * is set in mm_cpumask(loaded_mm), but is_lazy == true. * We're heuristically guessing that the CR3 load we * skipped more than makes up for the overhead added by * lazy mode. */ bool is_lazy; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); bool nmi_uaccess_okay(void); #define nmi_uaccess_okay nmi_uaccess_okay /* Initialize cr4 shadow for this CPU. */ static inline void cr4_init_shadow(void) { this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); } extern unsigned long mmu_cr4_features; extern u32 *trampoline_cr4_features; extern void initialize_tlbstate_and_flush(void); /* * TLB flushing: * * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page * - flush_tlb_range(vma, start, end) flushes a range of pages * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages * - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus * * ..but the i386 has somewhat limited tlb flushing capabilities, * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { /* * We support several kinds of flushes. * * - Fully flush a single mm. .mm will be set, .end will be * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to * which the IPI sender is trying to catch us up. * * - Partially flush a single mm. .mm will be set, .start and * .end will indicate the range, and .new_tlb_gen will be set * such that the changes between generation .new_tlb_gen-1 and * .new_tlb_gen are entirely contained in the indicated range. * * - Fully flush all mms whose tlb_gens have been updated. .mm * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen * will be zero. */ struct mm_struct *mm; unsigned long start; unsigned long end; u64 new_tlb_gen; unsigned int initiating_cpu; u8 stride_shift; u8 freed_tables; }; void flush_tlb_local(void); void flush_tlb_one_user(unsigned long addr); void flush_tlb_one_kernel(unsigned long addr); void flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif #define flush_tlb_mm(mm) \ flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) #define flush_tlb_range(vma, start, end) \ flush_tlb_mm_range((vma)->vm_mm, start, end, \ ((vma)->vm_flags & VM_HUGETLB) \ ? huge_page_shift(hstate_vma(vma)) \ : PAGE_SHIFT, false) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, bool freed_tables); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) { flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) { bool should_defer = false; /* If remote CPUs need to be flushed then defer batch the flush */ if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) should_defer = true; put_cpu(); return should_defer; } static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* * Bump the generation count. This also serves as a full barrier * that synchronizes with switch_mm(): callers are required to order * their read of mm_cpumask after their writes to the paging * structures. */ return atomic64_inc_return(&mm->context.tlb_gen); } static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm, unsigned long uaddr) { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) { flush_tlb_mm(mm); } extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); static inline bool pte_flags_need_flush(unsigned long oldflags, unsigned long newflags, bool ignore_access) { /* * Flags that require a flush when cleared but not when they are set. * Only include flags that would not trigger spurious page-faults. * Non-present entries are not cached. Hardware would set the * dirty/access bit if needed without a fault. */ const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED; const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3 | _PAGE_SOFTW4 | _PAGE_SAVED_DIRTY; const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX; unsigned long diff = oldflags ^ newflags; BUILD_BUG_ON(flush_on_clear & software_flags); BUILD_BUG_ON(flush_on_clear & flush_on_change); BUILD_BUG_ON(flush_on_change & software_flags); /* Ignore software flags */ diff &= ~software_flags; if (ignore_access) diff &= ~_PAGE_ACCESSED; /* * Did any of the 'flush_on_clear' flags was clleared set from between * 'oldflags' and 'newflags'? */ if (diff & oldflags & flush_on_clear) return true; /* Flush on modified flags. */ if (diff & flush_on_change) return true; /* Ensure there are no flags that were left behind */ if (IS_ENABLED(CONFIG_DEBUG_VM) && (diff & ~(flush_on_clear | software_flags | flush_on_change))) { VM_WARN_ON_ONCE(1); return true; } return false; } /* * pte_needs_flush() checks whether permissions were demoted and require a * flush. It should only be used for userspace PTEs. */ static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) { /* !PRESENT -> * ; no need for flush */ if (!(pte_flags(oldpte) & _PAGE_PRESENT)) return false; /* PFN changed ; needs flush */ if (pte_pfn(oldpte) != pte_pfn(newpte)) return true; /* * check PTE flags; ignore access-bit; see comment in * ptep_clear_flush_young(). */ return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte), true); } #define pte_needs_flush pte_needs_flush /* * huge_pmd_needs_flush() checks whether permissions were demoted and require a * flush. It should only be used for userspace huge PMDs. */ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) { /* !PRESENT -> * ; no need for flush */ if (!(pmd_flags(oldpmd) & _PAGE_PRESENT)) return false; /* PFN changed ; needs flush */ if (pmd_pfn(oldpmd) != pmd_pfn(newpmd)) return true; /* * check PMD flags; do not ignore access-bit; see * pmdp_clear_flush_young(). */ return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd), false); } #define huge_pmd_needs_flush huge_pmd_needs_flush #ifdef CONFIG_ADDRESS_MASKING static inline u64 tlbstate_lam_cr3_mask(void) { u64 lam = this_cpu_read(cpu_tlbstate.lam); return lam << X86_CR3_LAM_U57_BIT; } static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT); this_cpu_write(tlbstate_untag_mask, untag_mask); } #else static inline u64 tlbstate_lam_cr3_mask(void) { return 0; } static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { } #endif #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) { native_write_cr4(cr4 ^ X86_CR4_PGE); native_write_cr4(cr4); } #endif /* _ASM_X86_TLBFLUSH_H */ |
5 1 1 1 1 2 1 1 10 10 6 3 8 19 19 8 13 10 10 10 10 10 10 1 9 9 9 10 16 16 16 8 9 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "device.h" #include "peer.h" #include "socket.h" #include "queueing.h" #include "messages.h" #include <linux/ctype.h> #include <linux/net.h> #include <linux/if_vlan.h> #include <linux/if_ether.h> #include <linux/inetdevice.h> #include <net/udp_tunnel.h> #include <net/ipv6.h> static int send4(struct wg_device *wg, struct sk_buff *skb, struct endpoint *endpoint, u8 ds, struct dst_cache *cache) { struct flowi4 fl = { .saddr = endpoint->src4.s_addr, .daddr = endpoint->addr4.sin_addr.s_addr, .fl4_dport = endpoint->addr4.sin_port, .flowi4_mark = wg->fwmark, .flowi4_proto = IPPROTO_UDP }; struct rtable *rt = NULL; struct sock *sock; int ret = 0; skb_mark_not_on_list(skb); skb->dev = wg->dev; skb->mark = wg->fwmark; rcu_read_lock_bh(); sock = rcu_dereference_bh(wg->sock4); if (unlikely(!sock)) { ret = -ENONET; goto err; } fl.fl4_sport = inet_sk(sock)->inet_sport; if (cache) rt = dst_cache_get_ip4(cache, &fl.saddr); if (!rt) { security_sk_classify_flow(sock, flowi4_to_flowi_common(&fl)); if (unlikely(!inet_confirm_addr(sock_net(sock), NULL, 0, fl.saddr, RT_SCOPE_HOST))) { endpoint->src4.s_addr = 0; endpoint->src_if4 = 0; fl.saddr = 0; if (cache) dst_cache_reset(cache); } rt = ip_route_output_flow(sock_net(sock), &fl, sock); if (unlikely(endpoint->src_if4 && ((IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) || (!IS_ERR(rt) && rt->dst.dev->ifindex != endpoint->src_if4)))) { endpoint->src4.s_addr = 0; endpoint->src_if4 = 0; fl.saddr = 0; if (cache) dst_cache_reset(cache); if (!IS_ERR(rt)) ip_rt_put(rt); rt = ip_route_output_flow(sock_net(sock), &fl, sock); } if (IS_ERR(rt)) { ret = PTR_ERR(rt); net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); goto err; } if (cache) dst_cache_set_ip4(cache, &rt->dst, fl.saddr); } skb->ignore_df = 1; udp_tunnel_xmit_skb(rt, sock, skb, fl.saddr, fl.daddr, ds, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, fl.fl4_dport, false, false); goto out; err: kfree_skb(skb); out: rcu_read_unlock_bh(); return ret; } static int send6(struct wg_device *wg, struct sk_buff *skb, struct endpoint *endpoint, u8 ds, struct dst_cache *cache) { #if IS_ENABLED(CONFIG_IPV6) struct flowi6 fl = { .saddr = endpoint->src6, .daddr = endpoint->addr6.sin6_addr, .fl6_dport = endpoint->addr6.sin6_port, .flowi6_mark = wg->fwmark, .flowi6_oif = endpoint->addr6.sin6_scope_id, .flowi6_proto = IPPROTO_UDP /* TODO: addr->sin6_flowinfo */ }; struct dst_entry *dst = NULL; struct sock *sock; int ret = 0; skb_mark_not_on_list(skb); skb->dev = wg->dev; skb->mark = wg->fwmark; rcu_read_lock_bh(); sock = rcu_dereference_bh(wg->sock6); if (unlikely(!sock)) { ret = -ENONET; goto err; } fl.fl6_sport = inet_sk(sock)->inet_sport; if (cache) dst = dst_cache_get_ip6(cache, &fl.saddr); if (!dst) { security_sk_classify_flow(sock, flowi6_to_flowi_common(&fl)); if (unlikely(!ipv6_addr_any(&fl.saddr) && !ipv6_chk_addr(sock_net(sock), &fl.saddr, NULL, 0))) { endpoint->src6 = fl.saddr = in6addr_any; if (cache) dst_cache_reset(cache); } dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sock), sock, &fl, NULL); if (IS_ERR(dst)) { ret = PTR_ERR(dst); net_dbg_ratelimited("%s: No route to %pISpfsc, error %d\n", wg->dev->name, &endpoint->addr, ret); goto err; } if (cache) dst_cache_set_ip6(cache, dst, &fl.saddr); } skb->ignore_df = 1; udp_tunnel6_xmit_skb(dst, sock, skb, skb->dev, &fl.saddr, &fl.daddr, ds, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, fl.fl6_dport, false); goto out; err: kfree_skb(skb); out: rcu_read_unlock_bh(); return ret; #else kfree_skb(skb); return -EAFNOSUPPORT; #endif } int wg_socket_send_skb_to_peer(struct wg_peer *peer, struct sk_buff *skb, u8 ds) { size_t skb_len = skb->len; int ret = -EAFNOSUPPORT; read_lock_bh(&peer->endpoint_lock); if (peer->endpoint.addr.sa_family == AF_INET) ret = send4(peer->device, skb, &peer->endpoint, ds, &peer->endpoint_cache); else if (peer->endpoint.addr.sa_family == AF_INET6) ret = send6(peer->device, skb, &peer->endpoint, ds, &peer->endpoint_cache); else dev_kfree_skb(skb); if (likely(!ret)) peer->tx_bytes += skb_len; read_unlock_bh(&peer->endpoint_lock); return ret; } int wg_socket_send_buffer_to_peer(struct wg_peer *peer, void *buffer, size_t len, u8 ds) { struct sk_buff *skb = alloc_skb(len + SKB_HEADER_LEN, GFP_ATOMIC); if (unlikely(!skb)) return -ENOMEM; skb_reserve(skb, SKB_HEADER_LEN); skb_set_inner_network_header(skb, 0); skb_put_data(skb, buffer, len); return wg_socket_send_skb_to_peer(peer, skb, ds); } int wg_socket_send_buffer_as_reply_to_skb(struct wg_device *wg, struct sk_buff *in_skb, void *buffer, size_t len) { int ret = 0; struct sk_buff *skb; struct endpoint endpoint; if (unlikely(!in_skb)) return -EINVAL; ret = wg_socket_endpoint_from_skb(&endpoint, in_skb); if (unlikely(ret < 0)) return ret; skb = alloc_skb(len + SKB_HEADER_LEN, GFP_ATOMIC); if (unlikely(!skb)) return -ENOMEM; skb_reserve(skb, SKB_HEADER_LEN); skb_set_inner_network_header(skb, 0); skb_put_data(skb, buffer, len); if (endpoint.addr.sa_family == AF_INET) ret = send4(wg, skb, &endpoint, 0, NULL); else if (endpoint.addr.sa_family == AF_INET6) ret = send6(wg, skb, &endpoint, 0, NULL); /* No other possibilities if the endpoint is valid, which it is, * as we checked above. */ return ret; } int wg_socket_endpoint_from_skb(struct endpoint *endpoint, const struct sk_buff *skb) { memset(endpoint, 0, sizeof(*endpoint)); if (skb->protocol == htons(ETH_P_IP)) { endpoint->addr4.sin_family = AF_INET; endpoint->addr4.sin_port = udp_hdr(skb)->source; endpoint->addr4.sin_addr.s_addr = ip_hdr(skb)->saddr; endpoint->src4.s_addr = ip_hdr(skb)->daddr; endpoint->src_if4 = skb->skb_iif; } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { endpoint->addr6.sin6_family = AF_INET6; endpoint->addr6.sin6_port = udp_hdr(skb)->source; endpoint->addr6.sin6_addr = ipv6_hdr(skb)->saddr; endpoint->addr6.sin6_scope_id = ipv6_iface_scope_id( &ipv6_hdr(skb)->saddr, skb->skb_iif); endpoint->src6 = ipv6_hdr(skb)->daddr; } else { return -EINVAL; } return 0; } static bool endpoint_eq(const struct endpoint *a, const struct endpoint *b) { return (a->addr.sa_family == AF_INET && b->addr.sa_family == AF_INET && a->addr4.sin_port == b->addr4.sin_port && a->addr4.sin_addr.s_addr == b->addr4.sin_addr.s_addr && a->src4.s_addr == b->src4.s_addr && a->src_if4 == b->src_if4) || (a->addr.sa_family == AF_INET6 && b->addr.sa_family == AF_INET6 && a->addr6.sin6_port == b->addr6.sin6_port && ipv6_addr_equal(&a->addr6.sin6_addr, &b->addr6.sin6_addr) && a->addr6.sin6_scope_id == b->addr6.sin6_scope_id && ipv6_addr_equal(&a->src6, &b->src6)) || unlikely(!a->addr.sa_family && !b->addr.sa_family); } void wg_socket_set_peer_endpoint(struct wg_peer *peer, const struct endpoint *endpoint) { /* First we check unlocked, in order to optimize, since it's pretty rare * that an endpoint will change. If we happen to be mid-write, and two * CPUs wind up writing the same thing or something slightly different, * it doesn't really matter much either. */ if (endpoint_eq(endpoint, &peer->endpoint)) return; write_lock_bh(&peer->endpoint_lock); if (endpoint->addr.sa_family == AF_INET) { peer->endpoint.addr4 = endpoint->addr4; peer->endpoint.src4 = endpoint->src4; peer->endpoint.src_if4 = endpoint->src_if4; } else if (IS_ENABLED(CONFIG_IPV6) && endpoint->addr.sa_family == AF_INET6) { peer->endpoint.addr6 = endpoint->addr6; peer->endpoint.src6 = endpoint->src6; } else { goto out; } dst_cache_reset(&peer->endpoint_cache); out: write_unlock_bh(&peer->endpoint_lock); } void wg_socket_set_peer_endpoint_from_skb(struct wg_peer *peer, const struct sk_buff *skb) { struct endpoint endpoint; if (!wg_socket_endpoint_from_skb(&endpoint, skb)) wg_socket_set_peer_endpoint(peer, &endpoint); } void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer) { write_lock_bh(&peer->endpoint_lock); memset(&peer->endpoint.src6, 0, sizeof(peer->endpoint.src6)); dst_cache_reset_now(&peer->endpoint_cache); write_unlock_bh(&peer->endpoint_lock); } static int wg_receive(struct sock *sk, struct sk_buff *skb) { struct wg_device *wg; if (unlikely(!sk)) goto err; wg = sk->sk_user_data; if (unlikely(!wg)) goto err; skb_mark_not_on_list(skb); wg_packet_receive(wg, skb); return 0; err: kfree_skb(skb); return 0; } static void sock_free(struct sock *sock) { if (unlikely(!sock)) return; sk_clear_memalloc(sock); udp_tunnel_sock_release(sock->sk_socket); } static void set_sock_opts(struct socket *sock) { sock->sk->sk_allocation = GFP_ATOMIC; sock->sk->sk_sndbuf = INT_MAX; sk_set_memalloc(sock->sk); } int wg_socket_init(struct wg_device *wg, u16 port) { struct net *net; int ret; struct udp_tunnel_sock_cfg cfg = { .sk_user_data = wg, .encap_type = 1, .encap_rcv = wg_receive }; struct socket *new4 = NULL, *new6 = NULL; struct udp_port_cfg port4 = { .family = AF_INET, .local_ip.s_addr = htonl(INADDR_ANY), .local_udp_port = htons(port), .use_udp_checksums = true }; #if IS_ENABLED(CONFIG_IPV6) int retries = 0; struct udp_port_cfg port6 = { .family = AF_INET6, .local_ip6 = IN6ADDR_ANY_INIT, .use_udp6_tx_checksums = true, .use_udp6_rx_checksums = true, .ipv6_v6only = true }; #endif rcu_read_lock(); net = rcu_dereference(wg->creating_net); net = net ? maybe_get_net(net) : NULL; rcu_read_unlock(); if (unlikely(!net)) return -ENONET; #if IS_ENABLED(CONFIG_IPV6) retry: #endif ret = udp_sock_create(net, &port4, &new4); if (ret < 0) { pr_err("%s: Could not create IPv4 socket\n", wg->dev->name); goto out; } set_sock_opts(new4); setup_udp_tunnel_sock(net, new4, &cfg); #if IS_ENABLED(CONFIG_IPV6) if (ipv6_mod_enabled()) { port6.local_udp_port = inet_sk(new4->sk)->inet_sport; ret = udp_sock_create(net, &port6, &new6); if (ret < 0) { udp_tunnel_sock_release(new4); if (ret == -EADDRINUSE && !port && retries++ < 100) goto retry; pr_err("%s: Could not create IPv6 socket\n", wg->dev->name); goto out; } set_sock_opts(new6); setup_udp_tunnel_sock(net, new6, &cfg); } #endif wg_socket_reinit(wg, new4->sk, new6 ? new6->sk : NULL); ret = 0; out: put_net(net); return ret; } void wg_socket_reinit(struct wg_device *wg, struct sock *new4, struct sock *new6) { struct sock *old4, *old6; mutex_lock(&wg->socket_update_lock); old4 = rcu_dereference_protected(wg->sock4, lockdep_is_held(&wg->socket_update_lock)); old6 = rcu_dereference_protected(wg->sock6, lockdep_is_held(&wg->socket_update_lock)); rcu_assign_pointer(wg->sock4, new4); rcu_assign_pointer(wg->sock6, new6); if (new4) wg->incoming_port = ntohs(inet_sk(new4)->inet_sport); mutex_unlock(&wg->socket_update_lock); synchronize_net(); sock_free(old4); sock_free(old6); } |
16 8 1 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 1 1 1 12 13 13 12 13 13 13 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 18 18 18 18 11 3 3 3 15 15 15 1 15 15 15 15 3 2 3 3 3 3 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 | // SPDX-License-Identifier: GPL-2.0 /* * Block multiqueue core code * * Copyright (C) 2013-2014 Jens Axboe * Copyright (C) 2013-2014 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/llist.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> #include <linux/part_stat.h> #include <linux/sched/isolation.h> #include <trace/events/block.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static DEFINE_MUTEX(blk_mq_cpuhp_lock); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { return !list_empty_careful(&hctx->dispatch) || sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; if (!sbitmap_test_bit(&hctx->ctx_map, bit)) sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { struct block_device *part; unsigned int inflight[2]; }; static bool blk_mq_check_inflight(struct request *rq, void *priv) { struct mq_inflight *mi = priv; if (rq->rq_flags & RQF_IO_STAT && (!bdev_is_partition(mi->part) || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; } unsigned int blk_mq_in_flight(struct request_queue *q, struct block_device *part) { struct mq_inflight mi = { .part = part }; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); return mi.inflight[0] + mi.inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); inflight[0] = mi.inflight[0]; inflight[1] = mi.inflight[1]; } #ifdef CONFIG_LOCKDEP static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { if (!owner) return false; if (!q->mq_freeze_depth) { q->mq_freeze_owner = owner; q->mq_freeze_owner_depth = 1; return true; } if (owner == q->mq_freeze_owner) q->mq_freeze_owner_depth += 1; return false; } /* verify the last unfreeze in owner context */ static bool blk_unfreeze_check_owner(struct request_queue *q) { if (!q->mq_freeze_owner) return false; if (q->mq_freeze_owner != current) return false; if (--q->mq_freeze_owner_depth == 0) { q->mq_freeze_owner = NULL; return true; } return false; } #else static bool blk_freeze_set_owner(struct request_queue *q, struct task_struct *owner) { return false; } static bool blk_unfreeze_check_owner(struct request_queue *q) { return false; } #endif bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner) { bool freeze; mutex_lock(&q->mq_freeze_lock); freeze = blk_freeze_set_owner(q, owner); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } else { mutex_unlock(&q->mq_freeze_lock); } return freeze; } void blk_freeze_queue_start(struct request_queue *q) { if (__blk_freeze_queue_start(q, current)) blk_freeze_acquire_lock(q, false, false); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout) { return wait_event_timeout(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter), timeout); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); void blk_mq_freeze_queue(struct request_queue *q) { blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { bool unfreeze; mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } unfreeze = blk_unfreeze_check_owner(q); mutex_unlock(&q->mq_freeze_lock); return unfreeze; } void blk_mq_unfreeze_queue(struct request_queue *q) { if (__blk_mq_unfreeze_queue(q, false)) blk_unfreeze_release_lock(q, false, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); /* * non_owner variant of blk_freeze_queue_start * * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen * by the same task. This is fragile and should not be used if at all * possible. */ void blk_freeze_queue_start_non_owner(struct request_queue *q) { __blk_freeze_queue_start(q, NULL); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner); /* non_owner variant of blk_mq_unfreeze_queue */ void blk_mq_unfreeze_queue_non_owner(struct request_queue *q) { __blk_mq_unfreeze_queue(q, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->queue_lock, flags); if (!q->quiesce_depth++) blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has * been started on or more of the request_queues of the tag_set. This * function only waits for the quiesce on those request_queues that had * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { if (set->flags & BLK_MQ_F_BLOCKING) synchronize_srcu(set->srcu); else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); /** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * * Note: this function does not prevent that the struct request end_io() * callback function is invoked. Once this function is returned, we make * sure no dispatch can happen until the queue is unquiesced via * blk_mq_unquiesce_queue(). */ void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); /* nothing to wait for non-mq queues */ if (queue_is_mq(q)) blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); /* * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() * @q: request queue. * * This function recovers queue into the state before quiescing * which is done by blk_mq_quiesce_queue. */ void blk_mq_unquiesce_queue(struct request_queue *q) { unsigned long flags; bool run_queue = false; spin_lock_irqsave(&q->queue_lock, flags); if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { ; } else if (!--q->quiesce_depth) { blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); run_queue = true; } spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ if (run_queue) blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; mutex_lock(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } mutex_unlock(&set->tag_list_lock); blk_mq_wait_quiesce_done(set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; mutex_lock(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_unquiesce_queue(q); } mutex_unlock(&set->tag_list_lock); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); } void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = blk_time_get_ns(); blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) rq->alloc_time_ns = alloc_time_ns; else rq->alloc_time_ns = 0; #endif } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; rq->q = q; rq->mq_ctx = ctx; rq->mq_hctx = hctx; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; } rq->timeout = 0; rq->part = NULL; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; rq->nr_integrity_segments = 0; rq->end_io = NULL; rq->end_io_data = NULL; blk_crypto_rq_set_defaults(rq); INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); req_ref_set(rq, 1); if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); } return rq; } static inline struct request * __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) { unsigned int tag, tag_offset; struct blk_mq_tags *tags; struct request *rq; unsigned long tag_mask; int i, nr = 0; tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); if (unlikely(!tag_mask)) return NULL; tags = blk_mq_tags_from_data(data); for (i = 0; tag_mask; i++) { if (!(tag_mask & (1UL << i))) continue; tag = tag_offset + i; prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); rq_list_add_head(data->cached_rqs, rq); nr++; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rqs); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is * enabled for the queue. */ data->rq_flags |= RQF_SCHED_TAGS; /* * Flush/passthrough requests are special and go directly to the * dispatch list. */ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && !blk_op_is_passthrough(data->cmd_flags)) { struct elevator_mq_ops *ops = &q->elevator->type->ops; WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); data->rq_flags |= RQF_USE_SCHED; if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } } else { blk_mq_tag_busy(data->hctx); } if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; /* * Try batched alloc if we want more than 1 tag. */ if (data->nr_tags > 1) { rq = __blk_mq_alloc_requests_batch(data); if (rq) { blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } data->nr_tags = 1; } /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now. */ tag = blk_mq_get_tag(data); if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; /* * Give up the CPU and sleep for a random short time to * ensure that thread using a realtime scheduling class * are migrated off the CPU, and thus off the hctx that * is going away. */ msleep(3); goto retry; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data->hctx); rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_plug *plug, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .cmd_flags = opf, .nr_tags = plug->nr_ios, .cached_rqs = &plug->cached_rqs, }; struct request *rq; if (blk_queue_enter(q, flags)) return NULL; plug->nr_ios = 1; rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) blk_queue_exit(q); return rq; } static struct request *blk_mq_alloc_cached_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_plug *plug = current->plug; struct request *rq; if (!plug) return NULL; if (rq_list_empty(&plug->cached_rqs)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; rq_list_pop(&plug->cached_rqs); blk_mq_rq_time_init(rq, blk_time_get_ns()); } rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; } struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct request *rq; rq = blk_mq_alloc_cached_request(q, opf, flags); if (!rq) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .cmd_flags = opf, .nr_tags = 1, }; int ret; ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; } rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .cmd_flags = opf, .nr_tags = 1, }; u64 alloc_time_ns = 0; struct request *rq; unsigned int cpu; unsigned int tag; int ret; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue. */ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); /* * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ ret = -EXDEV; data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); if (q->elevator) data.rq_flags |= RQF_SCHED_TAGS; else blk_mq_tag_busy(data.hctx); if (flags & BLK_MQ_REQ_RESERVED) data.rq_flags |= RQF_RESV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; if (!(data.rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data.hctx); rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; blk_zone_finish_request(rq); if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* * For postflush request that may need to be * completed twice, we should clear this flag * to avoid double finish_request() on the rq. */ rq->rq_flags &= ~RQF_USE_SCHED; } } static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != BLK_MQ_NO_TAG) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_finish_request(rq); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; while ((rq = rq_list_pop(&plug->cached_rqs)) != NULL) blk_mq_free_request(rq); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->q->disk ? rq->q->disk->disk_name : "?", (__force unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->rq_flags & RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } static void blk_print_req_error(struct request *req, blk_status_t status) { printk_ratelimited(KERN_ERR "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), req->q->disk ? req->q->disk->disk_name : "?", blk_rq_pos(req), (__force u32)req_op(req), blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, IOPRIO_PRIO_CLASS(req_get_ioprio(req))); } /* * Fully end IO on a request. Does not support partial completions, or * errors. */ static void blk_complete_request(struct request *req) { const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; int total_bytes = blk_rq_bytes(req); struct bio *bio = req->bio; trace_block_rq_complete(req, BLK_STS_OK, total_bytes); if (!bio) return; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) blk_integrity_complete(req, total_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ blk_crypto_rq_put_keyslot(req); blk_account_io_completion(req, total_bytes); do { struct bio *next = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); blk_zone_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); bio = next; } while (bio); /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ if (!req->end_io) { req->bio = NULL; req->__data_len = 0; } } /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); if (!req->bio) return false; if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) blk_integrity_complete(req, nr_bytes); /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (unlikely(error)) bio->bi_status = error; if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported * as the BIO fragments may end up not being written * sequentially. */ bio->bi_status = BLK_STS_IOERR; } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (unlikely(quiet)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, bio_bytes); /* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { blk_zone_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); } total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); static inline void blk_account_io_done(struct request *req, u64 now) { trace_block_io_done(req); /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_local_dec(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } static inline bool blk_rq_passthrough_stats(struct request *req) { struct bio *bio = req->bio; if (!blk_queue_passthrough_stat(req->q)) return false; /* Requests without a bio do not transfer data. */ if (!bio) return false; /* * Stats are accumulated in the bdev, so must have one attached to a * bio to track stats. Most drivers do not set the bdev for passthrough * requests, but nvme is one that will set it. */ if (!bio->bi_bdev) return false; /* * We don't know what a passthrough command does, but we know the * payload size and data direction. Ensuring the size is aligned to the * block size filters out most commands with payloads that don't * represent sector access. */ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) return false; return true; } static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); if (!blk_queue_io_stat(req->q)) return; if (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)) return; req->rq_flags |= RQF_IO_STAT; req->start_time_ns = blk_time_get_ns(); /* * All non-passthrough requests are created from a bio with one * exception: when a flush command that is part of a flush sequence * generated by the state machine in blk-flush.c is cloned onto the * lower device by dm-multipath we can get here without a bio. */ if (req->bio) req->part = req->bio->bi_bdev; else req->part = req->q->disk->part0; part_stat_lock(); update_io_ticks(req->part, jiffies, false); part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); } inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); if (rq->end_io) { rq_qos_done(rq->q, rq); if (rq->end_io(rq, error) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } } EXPORT_SYMBOL(__blk_mq_end_request); void blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); __blk_mq_end_request(rq, error); } EXPORT_SYMBOL(blk_mq_end_request); #define TAG_COMP_BATCH 32 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, int *tag_array, int nr_tags) { struct request_queue *q = hctx->queue; blk_mq_sub_active_requests(hctx, nr_tags); blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } void blk_mq_end_request_batch(struct io_comp_batch *iob) { int tags[TAG_COMP_BATCH], nr_tags = 0; struct blk_mq_hw_ctx *cur_hctx = NULL; struct request *rq; u64 now = 0; if (iob->need_ts) now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); prefetch(rq->rq_next); blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); blk_mq_finish_request(rq); rq_qos_done(rq->q, rq); /* * If end_io handler returns NONE, then it still has * ownership of the request. */ if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { if (cur_hctx) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); nr_tags = 0; cur_hctx = rq->mq_hctx; } tags[nr_tags++] = rq->tag; } if (nr_tags) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); } EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); struct request *rq, *next; llist_for_each_entry_safe(rq, next, entry, ipi_list) rq->q->mq_ops->complete(rq); } static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } static int blk_softirq_cpu_dead(unsigned int cpu) { blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); return 0; } static void __blk_mq_complete_request_remote(void *data) { __raise_softirq_irqoff(BLOCK_SOFTIRQ); } static inline bool blk_mq_complete_need_ipi(struct request *rq) { int cpu = raw_smp_processor_id(); if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; /* * With force threaded interrupts enabled, raising softirq from an SMP * function call will always result in waking the ksoftirqd thread. * This is probably worse than completing the request on a different * cache domain. */ if (force_irqthreads()) return false; /* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && cpus_share_cache(cpu, rq->mq_ctx->cpu) && cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ return cpu_online(rq->mq_ctx->cpu); } static void blk_mq_complete_send_ipi(struct request *rq) { unsigned int cpu; cpu = rq->mq_ctx->cpu; if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) { struct llist_head *list; preempt_disable(); list = this_cpu_ptr(&blk_cpu_done); if (llist_add(&rq->ipi_list, list)) raise_softirq(BLOCK_SOFTIRQ); preempt_enable(); } bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * For request which hctx has only one ctx mapping, * or a polled request, always complete locally, * it's pointless to redirect the completion. */ if ((rq->mq_hctx->nr_ctx == 1 && rq->mq_ctx->cpu == raw_smp_processor_id()) || rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { blk_mq_complete_send_ipi(rq); return true; } if (rq->q->nr_hw_queues == 1) { blk_mq_raise_softirq(rq); return true; } return false; } EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed * * Description: * Complete a request by scheduling the ->complete_rq operation. **/ void blk_mq_complete_request(struct request *rq) { if (!blk_mq_complete_request_remote(rq)) rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started * * Function used by device drivers to notify the block layer that a request * is going to be processed now, so blk layer can do proper initializations * such as starting the timeout timer. */ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) blk_integrity_prepare(rq); if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging * requests. */ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) { if (plug->multiple_queues) return BLK_MAX_REQUEST_COUNT * 2; return BLK_MAX_REQUEST_COUNT; } static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { struct request *last = rq_list_peek(&plug->mq_list); if (!plug->rq_count) { trace_block_plug(rq->q); } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || (!blk_queue_nomerges(rq->q) && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_mq_flush_plug_list(plug, false); last = NULL; trace_block_plug(rq->q); } if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; /* * Any request allocated from sched tags can't be issued to * ->queue_rqs() directly */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq_list_add_tail(&plug->mq_list, rq); plug->rq_count++; } /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. * * Note: * This function will invoke @done directly if the queue is dead. */ void blk_execute_rq_nowait(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); struct blk_rq_wait { struct completion done; blk_status_t ret; }; static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) { struct blk_rq_wait *wait = rq->end_io_data; wait->ret = ret; complete(&wait->done); return RQ_END_IO_NONE; } bool blk_rq_is_poll(struct request *rq) { if (!rq->mq_hctx) return false; if (rq->mq_hctx->type != HCTX_TYPE_POLL) return false; return true; } EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); cond_resched(); } while (!completion_done(wait)); } /** * blk_execute_rq - insert a request into queue for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request(). */ blk_status_t blk_execute_rq(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = { .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), }; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; blk_account_io_start(rq); blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); if (blk_rq_is_poll(rq)) blk_rq_poll_completion(rq, &wait.done); else blk_wait_io(&wait.done); return wait.ret; } EXPORT_SYMBOL(blk_execute_rq); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_put_driver_tag(rq); trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; } } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); spin_lock_irqsave(&q->requeue_lock, flags); list_add_tail(&rq->queuelist, &q->requeue_list); spin_unlock_irqrestore(&q->requeue_lock, flags); if (kick_requeue_list) blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); LIST_HEAD(flush_list); struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); /* * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ if (rq->rq_flags & RQF_DONTPREP) blk_mq_request_bypass_insert(rq, 0); else blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); } while (!list_empty(&flush_list)) { rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); static bool blk_is_flush_data_rq(struct request *rq) { return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq); } static bool blk_mq_rq_inflight(struct request *rq, void *priv) { /* * If we find a request that isn't idle we know the queue is busy * as it's checked in the iter. * Return false to stop the iteration. * * In case of queue quiesce, if one flush data request is completed, * don't count it as inflight given the flush sequence is suspended, * and the original flush data request is invisible to driver, just * like other pending requests because of quiesce */ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) && blk_is_flush_data_rq(rq) && blk_mq_request_completed(rq))) { bool *busy = priv; *busy = true; return false; } return true; } bool blk_mq_queue_inflight(struct request_queue *q) { bool busy = false; blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); return busy; } EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); static void blk_mq_rq_timed_out(struct request *req) { req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; ret = req->q->mq_ops->timeout(req); if (ret == BLK_EH_DONE) return; WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } blk_add_timer(req); } struct blk_expired_data { bool has_timedout_rq; unsigned long next; unsigned long timeout_start; }; static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) return false; if (rq->rq_flags & RQF_TIMED_OUT) return false; deadline = READ_ONCE(rq->deadline); if (time_after_eq(expired->timeout_start, deadline)) return true; if (expired->next == 0) expired->next = deadline; else if (time_after(expired->next, deadline)) expired->next = deadline; return false; } void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { if (rq->end_io(rq, 0) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); } } static bool blk_mq_check_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot * be reallocated underneath the timeout handler's processing, then * the expire check is reliable. If the request is not expired, then * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). */ if (blk_mq_req_expired(rq, expired)) { expired->has_timedout_rq = true; return false; } return true; } static bool blk_mq_handle_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); struct blk_expired_data expired = { .timeout_start = jiffies, }; struct blk_mq_hw_ctx *hctx; unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting * completion, since the timeout code would not be able to * acquire the queue reference here. * * That's why we don't use blk_queue_enter here; instead, we use * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; /* check if there is any timed-out request */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); if (expired.has_timedout_rq) { /* * Before walking tags, we must ensure any submit started * before the current time has finished. Since the submit * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished */ blk_mq_wait_quiesce_done(q->tag_set); expired.next = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); } if (expired.next != 0) { mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. If * we end up here it means that no requests are pending and * also that no request has been pending for a while. Mark * each hctx as idle. */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); } } blk_queue_exit(q); } struct flush_busy_ctx_data { struct blk_mq_hw_ctx *hctx; struct list_head *list; }; static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; } /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, .list = list, }; sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; struct request *rq; }; static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); return !dispatch_data->rq; } struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, }; __sbitmap_for_each_set(&hctx->ctx_map, off, dispatch_rq_from_ctx, &data); return data.rq; } bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) return false; } tag = __sbitmap_queue_get(bt); if (tag == BLK_MQ_NO_TAG) return false; rq->tag = tag + tag_offset; blk_mq_inc_active_requests(rq->mq_hctx); return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { struct sbitmap_queue *sbq; list_del_init(&wait->entry); sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); blk_mq_run_hw_queue(hctx, true); return 1; } /* * Mark us waiting for a tag. For shared tags, this involves hooking us into * the tag wakeups. For non-shared tags, we can simply mark us needing a * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. * * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ return blk_mq_get_driver_tag(rq); } wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) sbq = &hctx->tags->breserved_tags; else sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } atomic_inc(&sbq->ws_active); wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); /* * Add one explicit barrier since blk_mq_get_driver_tag() may * not imply barrier in case of failure. * * Order adding us to wait queue and allocating driver tag. * * The pair is the one implied in sbitmap_queue_wake_up() which * orders clearing sbitmap tag bits and waitqueue_active() in * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless * * Otherwise, re-order of adding wait queue and getting driver tag * may cause __sbitmap_queue_wake_up() to wake up nothing because * the waitqueue_active() may not observe us in wait queue. */ smp_mb(); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq); if (!ret) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } /* * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ list_del_init(&wait->entry); atomic_dec(&sbq->ws_active); spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return true; } #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 /* * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): * - EWMA is one simple way to compute running average value * - weight(7/8 and 1/8) is applied so that it can decrease exponentially * - take 4 as factor for avoiding to get too small(0) result, and this * factor doesn't matter because EWMA decreases exponentially */ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; ewma = hctx->dispatch_busy; if (!ewma && !busy) return; ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; if (busy) ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; hctx->dispatch_busy = ewma; } #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, PREP_DISPATCH_NO_BUDGET, }; static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, bool need_budget) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; int budget_token = -1; if (need_budget) { budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) { blk_mq_put_driver_tag(rq); return PREP_DISPATCH_NO_BUDGET; } blk_mq_set_rq_budget_token(rq, budget_token); } if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The * waitqueue takes care of that. If the queue is run * before we add this entry back on the dispatch list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { /* * All budgets not got from this function will be put * together during handling partial dispatch */ if (need_budget) blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG; } } return PREP_DISPATCH_OK; } /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ static void blk_mq_release_budgets(struct request_queue *q, struct list_head *list) { struct request *rq; list_for_each_entry(rq, list, queuelist) { int budget_token = blk_mq_get_rq_budget_token(rq); if (budget_token >= 0) blk_mq_put_dispatch_budget(q, budget_token); } } /* * blk_mq_commit_rqs will notify driver using bd->last that there is no * more requests. (See comment in struct blk_mq_ops for commit_rqs for * details) * Attention, we should explicitly call this in unusual cases: * 1) did not queue everything initially scheduled to queue * 2) the last attempt to queue a request failed */ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, bool from_schedule) { if (hctx->queue->mq_ops->commit_rqs && queued) { trace_block_unplug(hctx->queue, queued, !from_schedule); hctx->queue->mq_ops->commit_rqs(hctx); } } /* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, unsigned int nr_budgets) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; bool needs_resource = false; if (list_empty(list)) return false; /* * Now process all the entries, sending them to the driver. */ queued = 0; do { struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); if (prep != PREP_DISPATCH_OK) break; list_del_init(&rq->queuelist); bd.rq = rq; bd.last = list_empty(list); /* * once the request is queued to lld, no need to cover the * budget any more */ if (nr_budgets) nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: needs_resource = true; fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ if (!list_empty(list) || ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ if (!list_empty(list)) { bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); if (nr_budgets) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* * Order adding requests to hctx->dispatch and checking * SCHED_RESTART flag. The pair of this smp_mb() is the one * in blk_mq_sched_restart(). Avoid restart code path to * miss the new added requests to hctx->dispatch, meantime * SCHED_RESTART is observed here. */ smp_mb(); /* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * * If 'no_tag' is set, that means that we failed getting * a driver tag with an I/O scheduler attached. If our dispatch * waitqueue is no longer active, ensure that we run the queue * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests * were pushed back onto the dispatch list. Rerun the queue to * avoid starvation. Notes: * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do * similar if we couldn't get budget or couldn't lock a zone * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); if (prep == PREP_DISPATCH_NO_BUDGET) needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); return false; } blk_mq_update_dispatch_busy(hctx, false); return true; } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) { int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(hctx->cpumask); return cpu; } /* * ->next_cpu is always calculated from hctx->cpumask, so simply use * it for speeding up the check */ static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) { return hctx->next_cpu >= nr_cpu_ids; } /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. * For now we just round-robin here, switching for every * BLK_MQ_CPU_WORK_BATCH queued items. */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { bool tried = false; int next_cpu = hctx->next_cpu; /* Switch to unbound if no allowable CPUs in this hctx */ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { select_cpu: next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } /* * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD. */ if (!cpu_online(next_cpu)) { if (!tried) { tried = true; goto select_cpu; } /* * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again. */ hctx->next_cpu = next_cpu; hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND; } hctx->next_cpu = next_cpu; return next_cpu; } /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx) { bool need_run; /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even blk_mq_hctx_has_pending() can't be called safely. * * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ __blk_mq_run_dispatch_ops(hctx->queue, false, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); return need_run; } /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. * * Check if the request queue is not in a quiesced state and if there are * pending requests to be sent. If this is true, run the queue to send requests * to hardware. */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { bool need_run; /* * We can't run the queue inline with interrupts disabled. */ WARN_ON_ONCE(!async && in_interrupt()); might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); need_run = blk_mq_hw_queue_need_run(hctx); if (!need_run) { unsigned long flags; /* * Synchronize with blk_mq_unquiesce_queue(), because we check * if hw queue is quiesced locklessly above, we need the use * ->queue_lock to make sure we see the up-to-date status to * not miss rerunning the hw queue. */ spin_lock_irqsave(&hctx->queue->queue_lock, flags); need_run = blk_mq_hw_queue_need_run(hctx); spin_unlock_irqrestore(&hctx->queue->queue_lock, flags); if (!need_run) return; } if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); /* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler. */ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and * dispatch from hctx for the current CPU since running multiple queues * just causes lock contention inside the scheduler and pointless cache * bouncing. */ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL; } /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. * @async: If we want to run the queue asynchronously. */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * If there is already a run_work pending, leave the * pending delay untouched. Otherwise, a hctx can stall * if another hctx is re-delaying the other's work * before the work executes. */ if (delayed_work_pending(&hctx->run_work)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { cancel_delayed_work(&hctx->run_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_start_hw_queues); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (!blk_mq_hctx_stopped(hctx)) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); /* * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order the * clearing of BLK_MQ_S_STOPPED above and the checking of dispatch * list in the subsequent routine. */ smp_mb__after_atomic(); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async || (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); } static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* * Try to issue requests directly if the hw queue isn't busy to save an * extra enqueue & dequeue to the sw queue. */ if (!hctx->dispatch_busy && !run_queue_async) { blk_mq_run_dispatch_ops(hctx->queue, blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); if (rq->cmd_flags & REQ_NOWAIT) run_queue_async = true; } spin_lock(&ctx->lock); list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); out: blk_mq_run_hw_queue(hctx, run_queue_async); } static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_rq_is_passthrough(rq)) { /* * Passthrough request have to be added to hctx->dispatch * directly. The device may be in a situation where it can't * handle FS request, and always returns BLK_STS_RESOURCE for * them, which gets them added to hctx->dispatch. * * If a passthrough request is required to unblock the queues, * and it is added to the scheduler queue, there is no chance to * dispatch it given we prioritize requests in hctx->dispatch. */ blk_mq_request_bypass_insert(rq, flags); } else if (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( * hctx->dispatch) directly and there is at most one in-flight * flush request for each hw queue, so it doesn't matter to add * flush request to tail or front of the dispatch queue. * * Secondly in case of NCQ, flush request belongs to non-NCQ * command, and queueing it will fail when there is any * in-flight normal IO request(NCQ command). When adding flush * rq to the front of hctx->dispatch, it is easier to introduce * extra time to flush rq's latency because of S_SCHED_RESTART * compared with adding to the tail of dispatch queue, then * chance of flush merge is increased, and less flush requests * will be issued to controller. It is observed that ~10% time * is saved in blktests block/004 on disk attached to AHCI/NCQ * drive when adding flush rq to the front of hctx->dispatch. * * Simply queue flush rq to the front of hctx->dispatch so that * intensive flush workloads can benefit in case of NCQ HW. */ blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); } else if (q->elevator) { LIST_HEAD(list); WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); list_add(&rq->queuelist, &list); q->elevator->type->ops.insert_requests(hctx, &list, flags); } else { trace_block_rq_insert(rq); spin_lock(&ctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); else list_add_tail(&rq->queuelist, &ctx->rq_lists[hctx->type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { int err; if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; blk_rq_bio_prep(rq, bio, nr_segs); if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, bio); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); WARN_ON_ONCE(err); blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; blk_status_t ret; /* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we * previously would have done. */ ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: blk_mq_update_dispatch_busy(hctx, false); break; } return ret; } static bool blk_mq_get_budget_and_tag(struct request *rq) { int budget_token; budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) return false; blk_mq_set_rq_budget_token(rq, budget_token); if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(rq->q, budget_token); return false; } return true; } /** * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so * we can try send it another time in the future. Requests inserted at this * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_status_t ret; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return; } if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); return; } ret = __blk_mq_issue_directly(hctx, rq, true); switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); break; default: blk_mq_end_request(rq, ret); break; } } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return BLK_STS_OK; } if (!blk_mq_get_budget_and_tag(rq)) return BLK_STS_RESOURCE; return __blk_mq_issue_directly(hctx, rq, last); } static void blk_mq_plug_issue_direct(struct blk_plug *plug) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(&plug->mq_list))) { bool last = rq_list_empty(&plug->mq_list); if (hctx != rq->mq_hctx) { if (hctx) { blk_mq_commit_rqs(hctx, queued, false); queued = 0; } hctx = rq->mq_hctx; } ret = blk_mq_request_issue_directly(rq, last); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static void __blk_mq_flush_plug_list(struct request_queue *q, struct blk_plug *plug) { if (blk_queue_quiesced(q)) return; q->mq_ops->queue_rqs(&plug->mq_list); } static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; struct rq_list requeue_list = {}; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); do { struct request *rq = rq_list_pop(&plug->mq_list); if (!this_hctx) { this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { rq_list_add_tail(&requeue_list, rq); continue; } list_add_tail(&rq->queuelist, &list); depth++; } while (!rq_list_empty(&plug->mq_list)); plug->mq_list = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); /* passthrough requests should never be issued to the I/O scheduler */ if (is_passthrough) { spin_lock(&this_hctx->lock); list_splice_tail_init(&list, &this_hctx->dispatch); spin_unlock(&this_hctx->lock); blk_mq_run_hw_queue(this_hctx, from_sched); } else if (this_hctx->queue->elevator) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); } else { blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); } percpu_ref_put(&this_hctx->queue->q_usage_counter); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request *rq; unsigned int depth; /* * We may have been called recursively midway through handling * plug->mq_list via a schedule() in the driver's queue_rq() callback. * To avoid mq_list changing under our feet, clear rq_count early and * bail out specifically if rq_count is 0 rather than checking * whether the mq_list is empty. */ if (plug->rq_count == 0) return; depth = plug->rq_count; plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { struct request_queue *q; rq = rq_list_peek(&plug->mq_list); q = rq->q; trace_block_unplug(q, depth, true); /* * Peek first request and see if we have a ->queue_rqs() hook. * If we do, we can dispatch the whole plug list in one go. We * already know at this point that all requests belong to the * same queue, caller must ensure that's the case. */ if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_plug_list(q, plug)); if (rq_list_empty(&plug->mq_list)) return; } blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug)); if (rq_list_empty(&plug->mq_list)) return; } do { blk_mq_dispatch_plug_list(plug, from_schedule); } while (!rq_list_empty(&plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); if (list_empty(list)) blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; } return false; } static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, struct bio *bio, unsigned int nsegs) { struct blk_mq_alloc_data data = { .q = q, .nr_tags = 1, .cmd_flags = bio->bi_opf, }; struct request *rq; rq_qos_throttle(q, bio); if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; data.cached_rqs = &plug->cached_rqs; } rq = __blk_mq_alloc_requests(&data); if (rq) return rq; rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); return NULL; } /* * Check if there is a suitable cached request and return it. */ static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, struct request_queue *q, blk_opf_t opf) { enum hctx_type type = blk_mq_get_hctx_type(opf); struct request *rq; if (!plug) return NULL; rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type && (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; return rq; } static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { if (rq_list_pop(&plug->cached_rqs) != rq) WARN_ON_ONCE(1); /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ rq_qos_throttle(rq->q, bio); blk_mq_rq_time_init(rq, blk_time_get_ns()); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); } static bool bio_unaligned(const struct bio *bio, struct request_queue *q) { unsigned int bs_mask = queue_logical_block_size(q) - 1; /* .bi_sector of any zero sized bio need to be initialized */ if ((bio->bi_iter.bi_size & bs_mask) || ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) return true; return false; } /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. The * request may not be queued directly to hardware if: * * This request can be merged with another one * * We want to place request at plug queue for possible future merging * * There is an IO scheduler active at this queue * * It will not queue the request if there is an error with the bio, or at the * request creation. */ void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; unsigned int nr_segs; struct request *rq; blk_status_t ret; /* * If the plug has a cached request for this queue, try to use it. */ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); /* * A BIO that was released from a zone write plug has already been * through the preparation in this function, already holds a reference * on the queue usage counter, and is the only write BIO in-flight for * the target zone. Go straight to preparing a request for it. */ if (bio_zone_write_plugging(bio)) { nr_segs = bio->__bi_nr_segments; if (rq) blk_queue_exit(q); goto new_request; } bio = blk_queue_bounce(bio, q); /* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. */ if (!rq) { if (unlikely(bio_queue_enter(bio))) return; } /* * Device reconfiguration may change logical block size, so alignment * check has to be done with queue usage counter held */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; } bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit; if (!bio_integrity_prep(bio)) goto queue_exit; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) goto queue_exit; new_request: if (!rq) { rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) goto queue_exit; } else { blk_mq_use_cached_rq(rq, plug, bio); } trace_block_getrq(bio); rq_qos_track(q, rq, bio); blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); return; } if (bio_zone_write_plugging(bio)) blk_zone_write_plug_init_request(rq); if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; if (plug) { blk_add_rq_to_plug(plug, rq); return; } hctx = rq->mq_hctx; if ((rq->rq_flags & RQF_USE_SCHED) || (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, true); } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } return; queue_exit: /* * Don't drop the queue reference if we were trying to use a cached * request and thus didn't acquire one. */ if (!rq) blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* * SCSI device does not have a good way to return if * Write Same/Zero is actually supported. If a device rejects * a non-read/write command (discard, write same,etc.) the * low-level device driver will set the relevant queue limit to * 0 to prevent blk-lib from issuing more of the offending * operations. Commands queued prior to the queue limit being * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O * errors being propagated to upper layers. */ if (max_sectors == 0) return BLK_STS_NOTSUPP; printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", __func__, blk_rq_sectors(rq), max_sectors); return BLK_STS_IOERR; } /* * The queue settings related to segment counting may differ from the * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > max_segments) { printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n", __func__, rq->nr_phys_segments, max_segments); return BLK_STS_IOERR; } if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) return ret; blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) { bio_put(bio); goto free_and_out; } if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else { rq->bio = rq->biotail = bio; } } /* Copy attributes of the original request to the clone request. */ rq->__sector = blk_rq_pos(rq_src); rq->__data_len = blk_rq_bytes(rq_src); if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { rq->rq_flags |= RQF_SPECIAL_PAYLOAD; rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; return 0; free_and_out: blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); #endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; } /* called before freeing request pool in @tags */ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; unsigned long flags; /* * There is no need to clear mapping if driver tags is not initialized * or the mapping belongs to the driver tags. */ if (!drv_tags || drv_tags == tags) return; list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } } /* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ spin_lock_irqsave(&drv_tags->lock, flags); spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; struct page *page; if (list_empty(&tags->page_list)) return; if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else drv_tags = set->tags[hctx_idx]; if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { struct request *rq = tags->static_rqs[i]; if (!rq) continue; set->ops->exit_request(set, rq, hctx_idx); tags->static_rqs[i] = NULL; } } blk_mq_clear_rq_mapping(drv_tags, tags); while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); /* * Remove kmemleak object previously allocated in * blk_mq_alloc_rqs(). */ kmemleak_free(page_address(page)); __free_pages(page, page->private); } } void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; blk_mq_free_tags(tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, unsigned int hctx_idx) { int i; for (i = 0; i < set->nr_maps; i++) { unsigned int start = set->map[i].queue_offset; unsigned int end = start + set->map[i].nr_queues; if (hctx_idx >= start && hctx_idx < end) break; } if (i >= set->nr_maps) i = HCTX_TYPE_DEFAULT; return i; } static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, unsigned int hctx_idx) { enum hctx_type type = hctx_idx_to_type(set, hctx_idx); return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); } static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; if (node == NUMA_NO_NODE) node = set->numa_node; tags = blk_mq_init_tags(nr_tags, reserved_tags, node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->static_rqs) goto err_free_rqs; return tags; err_free_rqs: kfree(tags->rqs); err_free_tags: blk_mq_free_tags(tags); return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { int ret; if (set->ops->init_request) { ret = set->ops->init_request(set, rq, hctx_idx, node); if (ret) return ret; } WRITE_ONCE(rq->state, MQ_RQ_IDLE); return 0; } static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; if (node == NUMA_NO_NODE) node = set->numa_node; INIT_LIST_HEAD(&tags->page_list); /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); left = rq_size * depth; for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; void *p; while (this_order && left < order_to_size(this_order - 1)) this_order--; do { page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) break; if (!this_order--) break; if (order_to_size(this_order) < rq_size) break; } while (1); if (!page) goto fail; page->private = this_order; list_add_tail(&page->lru, &tags->page_list); p = page_address(page); /* * Allow kmemleak to scan these pages as they contain pointers * to additional allocations like via ops->init_request(). */ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { struct request *rq = p; tags->static_rqs[i] = rq; if (blk_mq_init_request(set, rq, hctx_idx, node)) { tags->static_rqs[i] = NULL; goto fail; } p += rq_size; i++; } } return 0; fail: blk_mq_free_rqs(set, tags, hctx_idx); return -ENOMEM; } struct rq_iter_data { struct blk_mq_hw_ctx *hctx; bool has_rq; }; static bool blk_mq_has_request(struct request *rq, void *data) { struct rq_iter_data *iter_data = data; if (rq->mq_hctx != iter_data->hctx) return true; iter_data->has_rq = true; return false; } static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags; struct rq_iter_data data = { .hctx = hctx, }; blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); return data.has_rq; } static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, unsigned int this_cpu) { enum hctx_type type = hctx->type; int cpu; /* * hctx->cpumask has to rule out isolated CPUs, but userspace still * might submit IOs on these isolated CPUs, so use the queue map to * check if all CPUs mapped to this hctx are offline */ for_each_online_cpu(cpu) { struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, type, cpu); if (h != hctx) continue; /* this hctx has at least one online CPU */ if (this_cpu != cpu) return true; } return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* * Prevent new request from being allocated on the current hctx. * * The smp_mb__after_atomic() Pairs with the implied barrier in * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is * seen once we return from the tag allocator. */ set_bit(BLK_MQ_S_INACTIVE, &hctx->state); smp_mb__after_atomic(); /* * Try to grab a reference to the queue and wait for any outstanding * requests. If we could not grab a reference the queue has been * frozen and there are no requests. */ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { while (blk_mq_hctx_has_requests(hctx)) msleep(5); percpu_ref_put(&hctx->queue->q_usage_counter); } return 0; } /* * Check if one CPU is mapped to the specified hctx * * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed * to be used for scheduling kworker only. For other usage, please call this * helper for checking if one CPU belongs to the specified hctx */ static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu, const struct blk_mq_hw_ctx *hctx) { struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue, hctx->type, cpu); return mapped_hctx == hctx; } static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (blk_mq_cpu_mapped_to_hctx(cpu, hctx)) clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); return 0; } /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it * gets run. */ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx)) return 0; ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); return 0; } static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && !hlist_unhashed(&hctx->cpuhp_online)) { cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); INIT_HLIST_NODE(&hctx->cpuhp_online); } if (!hlist_unhashed(&hctx->cpuhp_dead)) { cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_dead); } } static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) { lockdep_assert_held(&blk_mq_cpuhp_lock); if (!(hctx->flags & BLK_MQ_F_STACKING) && hlist_unhashed(&hctx->cpuhp_online)) cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); if (hlist_unhashed(&hctx->cpuhp_dead)) cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } static void __blk_mq_remove_cpuhp_list(struct list_head *head) { struct blk_mq_hw_ctx *hctx; lockdep_assert_held(&blk_mq_cpuhp_lock); list_for_each_entry(hctx, head, hctx_list) __blk_mq_remove_cpuhp(hctx); } /* * Unregister cpuhp callbacks from exited hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) { LIST_HEAD(hctx_list); spin_lock(&q->unused_hctx_lock); list_splice_init(&q->unused_hctx_list, &hctx_list); spin_unlock(&q->unused_hctx_lock); mutex_lock(&blk_mq_cpuhp_lock); __blk_mq_remove_cpuhp_list(&hctx_list); mutex_unlock(&blk_mq_cpuhp_lock); spin_lock(&q->unused_hctx_lock); list_splice(&hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } /* * Register cpuhp callbacks from all hw queues * * Safe to call if this `request_queue` is live */ static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&blk_mq_cpuhp_lock); queue_for_each_hw_ctx(q, hctx, i) __blk_mq_add_cpuhp(hctx); mutex_unlock(&blk_mq_cpuhp_lock); } /* * Before freeing hw queue, clearing the flush request reference in * tags->rqs[] for avoiding potential UAF. */ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; unsigned long flags; /* The hw queue may not be mapped yet */ if (!tags) return; WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); /* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ spin_lock_irqsave(&tags->lock, flags); spin_unlock_irqrestore(&tags->lock, flags); } /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct request *flush_rq = hctx->fq->flush_rq; if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); if (blk_queue_init_done(q)) blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], set->queue_depth, flush_rq); if (set->ops->exit_request) set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; blk_mq_remove_cpuhp(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { hctx->queue_num = hctx_idx; hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto fail; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) goto exit_flush_rq; return 0; exit_flush_rq: if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); fail: return -1; } static struct blk_mq_hw_ctx * blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, int node) { struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) goto free_hctx; atomic_set(&hctx->nr_active, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); INIT_HLIST_NODE(&hctx->cpuhp_dead); INIT_HLIST_NODE(&hctx->cpuhp_online); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; INIT_LIST_HEAD(&hctx->hctx_list); /* * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), gfp, node); if (!hctx->ctxs) goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), gfp, node, false, false)) goto free_ctxs; hctx->nr_ctx = 0; spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto free_bitmap; blk_mq_hctx_kobj_init(hctx); return hctx; free_bitmap: sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); free_cpumask: free_cpumask_var(hctx->cpumask); free_hctx: kfree(hctx); fail_alloc_hctx: return NULL; } static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { struct blk_mq_tag_set *set = q->tag_set; unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) INIT_LIST_HEAD(&__ctx->rq_lists[k]); __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ for (j = 0; j < set->nr_maps; j++) { hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = cpu_to_node(i); } } } struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { struct blk_mq_tags *tags; int ret; tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); if (!tags) return NULL; ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { blk_mq_free_rq_map(tags); return NULL; } return tags; } static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, int hctx_idx) { if (blk_mq_is_shared_tags(set->flags)) { set->tags[hctx_idx] = set->shared_tags; return true; } set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, set->queue_depth); return set->tags[hctx_idx]; } void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); blk_mq_free_rq_map(tags); } } static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx) { if (!blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) { unsigned int j, hctx_idx; unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; hctx->dispatch_from = NULL; } /* * Map software to hardware queues. * * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { if (!set->map[j].nr_queues) { ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); continue; } hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ set->map[j].mq_map[i] = 0; } hctx = blk_mq_map_queue_type(q, j, i); ctx->hctxs[j] = hctx; /* * If the CPU is already set in the mask, then we've * mapped this one already. This can happen if * devices share queues across queue maps. */ if (cpumask_test_cpu(i, hctx->cpumask)) continue; cpumask_set_cpu(i, hctx->cpumask); hctx->type = j; ctx->index_hw[hctx->type] = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; /* * If the nr_ctx type overflows, we have exceeded the * amount of sw queues we can support. */ BUG_ON(!hctx->nr_ctx); } for (; j < HCTX_MAX_TYPES; j++) ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); } queue_for_each_hw_ctx(q, hctx, i) { int cpu; /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. */ if (!hctx->nr_ctx) { /* Never unmap queue 0. We need it as a * fallback in case of a new remap fails * allocation */ if (i) __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; } hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Rule out isolated CPUs from hctx->cpumask to avoid * running block kworker on isolated CPUs */ for_each_cpu(cpu, hctx->cpumask) { if (cpu_is_isolated(cpu)) cpumask_clear_cpu(cpu, hctx->cpumask); } /* * Initialize batch roundrobin counts */ hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } /* * Caller needs to ensure that we're either frozen/quiesced, or that * the queue isn't live yet. */ static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; } else { blk_mq_tag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } } static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { struct request_queue *q; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q); } } static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); INIT_LIST_HEAD(&q->tag_set_list); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { mutex_lock(&set->tag_list_lock); /* * Check to see if we're transitioning to shared (from 1 to 2 queues). */ if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, true); } if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } /* All allocations will be freed in release handler of q->mq_kobj */ static int blk_mq_alloc_ctxs(struct request_queue *q) { struct blk_mq_ctxs *ctxs; int cpu; ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); if (!ctxs) return -ENOMEM; ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!ctxs->queue_ctx) goto fail; for_each_possible_cpu(cpu) { struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); ctx->ctxs = ctxs; } q->mq_kobj = &ctxs->kobj; q->queue_ctx = ctxs->queue_ctx; return 0; fail: kfree(ctxs); return -ENOMEM; } /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free * and headache because q->mq_kobj shouldn't have been introduced, * but we can't group ctx/kctx kobj without it. */ void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); /* all hctx are in .unused_hctx_list now */ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { list_del_init(&hctx->hctx_list); kobject_put(&hctx->kobj); } xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); } static bool blk_mq_can_poll(struct blk_mq_tag_set *set) { return set->nr_maps > HCTX_TYPE_POLL && set->map[HCTX_TYPE_POLL].nr_queues; } struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { struct queue_limits default_lim = { }; struct request_queue *q; int ret; if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; if (blk_mq_can_poll(set)) lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { blk_put_queue(q); return ERR_PTR(ret); } return q; } EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). * * Context: can sleep */ void blk_mq_destroy_queue(struct request_queue *q) { WARN_ON_ONCE(!queue_is_mq(q)); WARN_ON_ONCE(blk_queue_registered(q)); might_sleep(); blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass) { struct gendisk *disk; if (!blk_get_queue(q)) return NULL; disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); if (!disk) blk_put_queue(q); return disk; } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); /* * Only hctx removed from cpuhp list can be reused */ static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx) { return hlist_unhashed(&hctx->cpuhp_online) && hlist_unhashed(&hctx->cpuhp_dead); } static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) { struct blk_mq_hw_ctx *hctx = NULL, *tmp; /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) { hctx = tmp; break; } } if (hctx) list_del_init(&hctx->hctx_list); spin_unlock(&q->unused_hctx_lock); if (!hctx) hctx = blk_mq_alloc_hctx(q, set, node); if (!hctx) goto fail; if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) goto free_hctx; return hctx; free_hctx: kobject_put(&hctx->kobj); fail: return NULL; } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i, j; /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); WARN_ON_ONCE(!hctx); } } /* * Increasing nr_hw_queues fails. Free the newly allocated * hctxs and keep the previous q->nr_hw_queues. */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; } else { j = i; q->nr_hw_queues = set->nr_hw_queues; } xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); /* unregister cpuhp callbacks for exited hctxs */ blk_mq_remove_hw_queues_cpuhp(q); /* register cpuhp for new initialized hctxs */ blk_mq_add_hw_queues_cpuhp(q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; /* * ->tag_set has to be setup before initialize hctx, which cpuphp * handler needs it for checking queue mapping */ q->tag_set = set; if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); xa_init(&q->hctx_table); blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); return 0; err_hctxs: blk_mq_release(q); err_exit: q->mq_ops = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ blk_mq_del_queue_tag_set(q); } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; if (blk_mq_is_shared_tags(set->flags)) { set->shared_tags = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, set->queue_depth); if (!set->shared_tags) return -ENOMEM; } for (i = 0; i < set->nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } return 0; out_unwind: while (--i >= 0) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } return -ENOMEM; } /* * Allocate the request maps associated with this tag_set. Note that this * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; depth = set->queue_depth; do { err = __blk_mq_alloc_rq_maps(set); if (!err) break; set->queue_depth >>= 1; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { err = -ENOMEM; break; } } while (set->queue_depth); if (!set->queue_depth || err) { pr_err("blk-mq: failed to allocate request map\n"); return -ENOMEM; } if (depth != set->queue_depth) pr_info("blk-mq: reduced tag depth (%u -> %u)\n", depth, set->queue_depth); return 0; } static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the * number of hardware queues. */ if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; if (set->ops->map_queues) { int i; /* * transport .map_queues is usually done in the following * way: * * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; int i; if (set->nr_hw_queues >= new_nr_hw_queues) goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!new_tags) return -ENOMEM; if (set->tags) memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) { while (--i >= set->nr_hw_queues) __blk_mq_free_map_and_rqs(set, i); return -ENOMEM; } cond_resched(); } done: set->nr_hw_queues = new_nr_hw_queues; return 0; } /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; if (!set->ops->queue_rq) return -EINVAL; if (!set->ops->get_budget ^ !set->ops->put_budget) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH); set->queue_depth = BLK_MQ_MAX_DEPTH; } if (!set->nr_maps) set->nr_maps = 1; else if (set->nr_maps > HCTX_MAX_TYPES) return -EINVAL; /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 64 tags to prevent * using too much memory. */ if (is_kdump_kernel()) set->queue_depth = min(64U, set->queue_depth); /* * There is no use for more h/w queues than cpus if we just have * a single map */ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; if (set->flags & BLK_MQ_F_BLOCKING) { set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); if (!set->srcu) return -ENOMEM; ret = init_srcu_struct(set->srcu); if (ret) goto out_free_srcu; } ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) goto out_cleanup_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; set->map[i].nr_queues = set->nr_hw_queues; } blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); set->map[i].mq_map = NULL; } kfree(set->tags); set->tags = NULL; out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); out_free_srcu: if (set->flags & BLK_MQ_F_BLOCKING) kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); /* allocate and initialize a tagset for a simple single-queue device */ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags) { memset(set, 0, sizeof(*set)); set->ops = ops; set->nr_hw_queues = 1; set->nr_maps = 1; set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; return blk_mq_alloc_tag_set(set); } EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; for (i = 0; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); set->map[j].mq_map = NULL; } kfree(set->tags); set->tags = NULL; if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); } } EXPORT_SYMBOL(blk_mq_free_tag_set); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int ret; unsigned long i; if (WARN_ON_ONCE(!q->mq_freeze_depth)) return -EINVAL; if (!set) return -EINVAL; if (q->nr_requests == nr) return 0; blk_mq_quiesce_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; /* * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } if (!ret) { q->nr_requests = nr; if (blk_mq_is_shared_tags(set->flags)) { if (q->elevator) blk_mq_tag_update_sched_shared_tags(q); else blk_mq_tag_resize_shared_tags(set, nr); } } blk_mq_unquiesce_queue(q); return ret; } /* * request_queue and elevator_type pair. * It is just used by __blk_mq_update_nr_hw_queues to cache * the elevator_type associated with a request_queue. */ struct blk_mq_qe_pair { struct list_head node; struct request_queue *q; struct elevator_type *type; }; /* * Cache the elevator_type in qe pair list and switch the * io scheduler to 'none' */ static bool blk_mq_elv_switch_none(struct list_head *head, struct request_queue *q) { struct blk_mq_qe_pair *qe; qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!qe) return false; /* q->elevator needs protection from ->sysfs_lock */ mutex_lock(&q->sysfs_lock); /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); goto unlock; } INIT_LIST_HEAD(&qe->node); qe->q = q; qe->type = q->elevator->type; /* keep a reference to the elevator module as we'll switch back */ __elevator_get(qe->type); list_add(&qe->node, head); elevator_disable(q); unlock: mutex_unlock(&q->sysfs_lock); return true; } static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head, struct request_queue *q) { struct blk_mq_qe_pair *qe; list_for_each_entry(qe, head, node) if (qe->q == q) return qe; return NULL; } static void blk_mq_elv_switch_back(struct list_head *head, struct request_queue *q) { struct blk_mq_qe_pair *qe; struct elevator_type *t; qe = blk_lookup_qe_pair(head, q); if (!qe) return; t = qe->type; list_del(&qe->node); kfree(qe); mutex_lock(&q->sysfs_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); mutex_unlock(&q->sysfs_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; LIST_HEAD(head); int prev_nr_hw_queues = set->nr_hw_queues; int i; lockdep_assert_held(&set->tag_list_lock); if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1) return; if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. */ list_for_each_entry(q, &set->tag_list, tag_set_list) if (!blk_mq_elv_switch_none(&head, q)) goto switch_back; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); } if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto reregister; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { struct queue_limits lim; blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } lim = queue_limits_start_update(q); if (blk_mq_can_poll(set)) lim.features |= BLK_FEAT_POLL; else lim.features &= ~BLK_FEAT_POLL; if (queue_limits_commit_update(q, &lim) < 0) pr_warn("updating the poll flag failed\n"); blk_mq_map_swqueue(q); } reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); } switch_back: list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_elv_switch_back(&head, q); list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags) { long state = get_current_state(); int ret; do { ret = q->mq_ops->poll(hctx, iob); if (ret > 0) { __set_current_state(TASK_RUNNING); return ret; } if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); if (task_is_running(current)) return 1; if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); __set_current_state(TASK_RUNNING); return 0; } int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); return blk_hctx_poll(q, hctx, iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags) { struct request_queue *q = rq->q; int ret; if (!blk_rq_is_poll(rq)) return 0; if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(blk_rq_poll); unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; } EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); for_each_possible_cpu(i) INIT_CSD(&per_cpu(blk_cpu_csd, i), __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, "block/softirq:dead", NULL, blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", blk_mq_hctx_notify_online, blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init); |
3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include "mballoc.h" #include <trace/events/ext4.h> #include <kunit/static_stub.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ /* * Calculate block group number for a given block number */ ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block) { ext4_group_t group; if (test_opt2(sb, STD_GROUP_SIZE)) group = (block - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); else ext4_get_group_no_and_offset(sb, block, &group, NULL); return group; } /* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) *blockgrpp = blocknr; } /* * Check whether the 'block' lives within the 'block_group'. Returns 1 if so * and 0 otherwise. */ static inline int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; actual_group = ext4_get_group_number(sb, block); return (actual_group == block_group) ? 1 : 0; } /* * Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned base_clusters, num_clusters; int block_cluster = -1, inode_cluster; int itbl_cluster_start = -1, itbl_cluster_end = -1; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; ext4_fsblk_t itbl_blk_start, itbl_blk_end; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ base_clusters = ext4_num_base_meta_clusters(sb, block_group); num_clusters = base_clusters; /* * Account and record inode table clusters if any cluster * is in the block group, or inode table cluster range is * [-1, -1] and won't overlap with block/inode bitmap cluster * accounted below. */ itbl_blk_start = ext4_inode_table(sb, gdp); itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; if (itbl_blk_start <= end && itbl_blk_end >= start) { itbl_blk_start = max(itbl_blk_start, start); itbl_blk_end = min(itbl_blk_end, end); itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); num_clusters += itbl_cluster_end - itbl_cluster_start + 1; /* check if border cluster is overlapped */ if (itbl_cluster_start == base_clusters - 1) num_clusters--; } /* * For the allocation bitmaps, we first need to check to see * if the block is in the block group. If it is, then check * to see if the cluster is already accounted for in the clusters * used for the base metadata cluster and inode tables cluster. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); if (block_cluster >= base_clusters && (block_cluster < itbl_cluster_start || block_cluster > itbl_cluster_end)) num_clusters++; } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); /* * Additional check if inode bitmap is in just accounted * block_cluster */ if (inode_cluster != block_cluster && inode_cluster >= base_clusters && (inode_cluster < itbl_cluster_start || inode_cluster > itbl_cluster_end)) num_clusters++; } return num_clusters; } static unsigned int num_clusters_in_group(struct super_block *sb, ext4_group_t block_group) { unsigned int blocks; if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and * last group, just in case some other tool was used, * we need to make sure we calculate the right free * blocks. */ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else blocks = EXT4_BLOCKS_PER_GROUP(sb); return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; ASSERT(buffer_locked(bh)); if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); if ((bit_max >> 3) >= bh->b_size) return -EFSCORRUPTED; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); return 0; } /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } /* * The free blocks are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. The descriptors are loaded in memory * when a file system is mounted (see ext4_fill_super). */ /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block * @block_group: given block group * @bh: pointer to the buffer head to store the block * group descriptor */ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { unsigned int group_desc; unsigned int offset; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, sb, block_group, bh); if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); return NULL; } group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); /* * sbi_array_rcu_deref returns with rcu unlocked, this is ok since * the pointer being dereferenced won't be dereferenced again. By * looking at the usage in add_new_gdb() the value isn't modified, * just the pointer, and so it remains valid. */ if (!bh_p) { ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } desc = (struct ext4_group_desc *)( (__u8 *)bh_p->b_data + offset * EXT4_DESC_SIZE(sb)); if (bh) *bh = bh_p; return desc; } static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { ext4_grpblk_t next_zero_bit; unsigned long bitmap_size = sb->s_blocksize * 8; unsigned int offset = num_clusters_in_group(sb, block_group); if (bitmap_size <= offset) return 0; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); return (next_zero_bit < bitmap_size ? next_zero_bit : 0); } struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group) { struct ext4_group_info **grp_info; long indexv, indexh; if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) return NULL; indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); return grp_info[indexh]; } /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups * or it has to also read the block group where the bitmaps * are located to verify they are set. */ return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, EXT4_B2C(sbi, offset)); if (next_zero_bit < EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) /* bad bitmap for inode tables */ return blk; return 0; } static int ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /** * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * @ignore_locked: ignore locked buffers * * Read the bitmap for a given block_group,and validate the * bits for block/inode/inode tables are set in the bitmaps * * Return buffer_head on success or an ERR_PTR in case of failure. */ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, sb, block_group, ignore_locked); desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (ignore_locked && buffer_locked(bh)) { /* buffer under IO already, return if called for prefetching */ put_bh(bh); return NULL; } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Block bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); if (err) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? REQ_RAHEAD : 0), ext4_end_bitmap_read, ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* Returns 0 on success, -errno on error */ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_group_desc *desc; KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, sb, block_group, bh); if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; int err; bh = ext4_read_block_bitmap_nowait(sb, block_group, false); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); if (err) { put_bh(bh); return ERR_PTR(err); } return bh; } /** * ext4_has_free_clusters() * @sbi: in-core super block structure. * @nclusters: number of needed blocks * @flags: flags from ext4_mb_new_blocks() * * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + resv_clusters; if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) return 1; } /* No free blocks. Let's see if we can dip into reserved pool */ if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; } /** * ext4_should_retry_alloc() - check if a block allocation should be retried * @sb: superblock * @retries: number of retry attempts made so far * * ext4_should_retry_alloc() is called when ENOSPC is returned while * attempting to allocate blocks. If there's an indication that a pending * journal transaction might free some space and allow another attempt to * succeed, this function will wait for the current or committing transaction * to complete and then return TRUE. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (!sbi->s_journal) return 0; if (++(*retries) > 3) { percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); return 0; } /* * if there's no indication that blocks are about to be freed it's * possible we just missed a transaction commit that did so */ smp_mb(); if (sbi->s_mb_free_pending == 0) { if (test_opt(sb, DISCARD)) { atomic_inc(&sbi->s_retry_alloc_pending); flush_work(&sbi->s_discard_work); atomic_dec(&sbi->s_retry_alloc_pending); } return ext4_has_free_clusters(sbi, 1, 0); } /* * it's possible we've just missed a transaction commit here, * so ignore the returned status */ ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; /* * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * * Adds up the number of free clusters from each block group. */ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; #endif } static inline int test_root(ext4_group_t a, int b) { while (1) { if (a < b) return 0; if (a == b) return 1; if ((a % b) != 0) return 0; a = a / b; } } /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the superblock (primary or backup) * in this group. Currently this will be only 0 or 1. */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (group == 0) return 1; if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; if (test_root(group, 3) || (test_root(group, 5)) || test_root(group, 7)) return 1; return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, ext4_group_t group) { unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; if (group == first || group == first + 1 || group == last) return 1; return 0; } static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { if (!ext4_bg_has_super(sb, group)) return 0; if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; } /** * ext4_bg_num_gdb - number of blocks used by the group table in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the group descriptor table * (primary or backup) in this group. In the future there may be a * different number of descriptor blocks in each group. */ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) { unsigned long first_meta_bg = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } /* * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { num += ext4_bg_num_gdb_nometa(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } return num; } static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); } /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation * * Return the ideal location to start allocating blocks for a * newly created inode. */ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_group_t block_group; ext4_grpblk_t colour; int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); ext4_fsblk_t bg_start; ext4_fsblk_t last_block; block_group = ei->i_block_group; if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME * block groups per flexgroup, reserve the first block * group for directories and special files. Regular * files will start at the second block group. This * tends to speed up directory access and improves * fsck times. */ block_group &= ~(flex_size-1); if (S_ISREG(inode->i_mode)) block_group++; } bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* * If we are doing delayed allocation, we don't need take * colour into account. */ if (test_opt(inode->i_sb, DELALLOC)) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else colour = (task_pid_nr(current) % 16) * ((last_block - bg_start) / 16); return bg_start + colour; } |
3626 3 3236 3237 22 3233 696 3257 809 815 814 825 77 77 75 77 73 3 722 712 22 1 720 609 728 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Percpu refcounts: * (C) 2012 Google, Inc. * Author: Kent Overstreet <koverstreet@google.com> * * This implements a refcount with similar semantics to atomic_t - atomic_inc(), * atomic_dec_and_test() - but percpu. * * There's one important difference between percpu refs and normal atomic_t * refcounts; you have to keep track of your initial refcount, and then when you * start shutting down you call percpu_ref_kill() _before_ dropping the initial * refcount. * * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less * than an atomic_t - this is because of the way shutdown works, see * percpu_ref_kill()/PERCPU_COUNT_BIAS. * * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() * puts the ref back in single atomic_t mode, collecting the per cpu refs and * issuing the appropriate barriers, and then marks the ref as shutting down so * that percpu_ref_put() will check for the ref hitting 0. After it returns, * it's safe to drop the initial ref. * * USAGE: * * See fs/aio.c for some example usage; it's used there for struct kioctx, which * is created when userspaces calls io_setup(), and destroyed when userspace * calls io_destroy() or the process exits. * * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref. * After that, there can't be any new users of the kioctx (from lookup_ioctx()) * and it's then safe to drop the initial ref with percpu_ref_put(). * * Note that the free path, free_ioctx(), needs to go through explicit call_rcu() * to synchronize with RCU protected lookup_ioctx(). percpu_ref operations don't * imply RCU grace periods of any kind and if a user wants to combine percpu_ref * with RCU protection, it must be done explicitly. * * Code that does a two stage shutdown like this often needs some kind of * explicit synchronization to ensure the initial refcount can only be dropped * once - percpu_ref_kill() does this for you, it returns true once and false if * someone else already called it. The aio code uses it this way, but it's not * necessary if the code has some other mechanism to synchronize teardown. * around. */ #ifndef _LINUX_PERCPU_REFCOUNT_H #define _LINUX_PERCPU_REFCOUNT_H #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/types.h> #include <linux/gfp.h> struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ enum { __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, __PERCPU_REF_FLAG_BITS = 2, }; /* @flags for percpu_ref_init() */ enum { /* * Start w/ ref == 1 in atomic mode. Can be switched to percpu * operation using percpu_ref_switch_to_percpu(). If initialized * with this flag, the ref will stay in atomic mode until * percpu_ref_switch_to_percpu() is invoked on it. * Implies ALLOW_REINIT. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, /* * Start dead w/ ref == 0 in atomic mode. Must be revived with * percpu_ref_reinit() before used. Implies INIT_ATOMIC and * ALLOW_REINIT. */ PERCPU_REF_INIT_DEAD = 1 << 1, /* * Allow switching from atomic mode to percpu mode. */ PERCPU_REF_ALLOW_REINIT = 1 << 2, }; struct percpu_ref_data { atomic_long_t count; percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; bool force_atomic:1; bool allow_reinit:1; struct rcu_head rcu; struct percpu_ref *ref; }; struct percpu_ref { /* * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. */ unsigned long percpu_count_ptr; /* * 'percpu_ref' is often embedded into user structure, and only * 'percpu_count_ptr' is required in fast path, move other fields * into 'percpu_ref_data', so we can reduce memory footprint in * fast path. */ struct percpu_ref_data *data; }; int __must_check percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, unsigned int flags, gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); void percpu_ref_resurrect(struct percpu_ref *ref); void percpu_ref_reinit(struct percpu_ref *ref); bool percpu_ref_is_zero(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref * @ref: percpu_ref to kill * * Must be used to drop the initial ref on a percpu refcount; must be called * precisely once before shutdown. * * Switches @ref into atomic mode before gathering up the percpu counters * and dropping the initial ref. * * There are no implied RCU grace periods between kill and release. */ static inline void percpu_ref_kill(struct percpu_ref *ref) { percpu_ref_kill_and_confirm(ref, NULL); } /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional * branches as it can't assume that @ref->percpu_count is not NULL. */ static inline bool __ref_is_percpu(struct percpu_ref *ref, unsigned long __percpu **percpu_countp) { unsigned long percpu_ptr; /* * The value of @ref->percpu_count_ptr is tested for * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then * used as a pointer. If the compiler generates a separate fetch * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in * between contaminating the pointer value, meaning that * READ_ONCE() is required when fetching it. * * The dependency ordering from the READ_ONCE() pairs * with smp_store_release() in __percpu_ref_switch_to_percpu(). */ percpu_ptr = READ_ONCE(ref->percpu_count_ptr); /* * Theoretically, the following could test just ATOMIC; however, * then we'd have to mask off DEAD separately as DEAD may be * visible without ATOMIC if we race with percpu_ref_kill(). DEAD * implies ATOMIC anyway. Test them together. */ if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) return false; *percpu_countp = (unsigned long __percpu *)percpu_ptr; return true; } /** * percpu_ref_get_many - increment a percpu refcount * @ref: percpu_ref to get * @nr: number of references to get * * Analogous to atomic_long_add(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_add(*percpu_count, nr); else atomic_long_add(nr, &ref->data->count); rcu_read_unlock(); } /** * percpu_ref_get - increment a percpu refcount * @ref: percpu_ref to get * * Analogous to atomic_long_inc(). * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_get(struct percpu_ref *ref) { percpu_ref_get_many(ref, 1); } /** * percpu_ref_tryget_many - try to increment a percpu refcount * @ref: percpu_ref to try-get * @nr: number of references to get * * Increment a percpu refcount by @nr unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; bool ret; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_add(*percpu_count, nr); ret = true; } else { ret = atomic_long_add_unless(&ref->data->count, nr, 0); } rcu_read_unlock(); return ret; } /** * percpu_ref_tryget - try to increment a percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless its count already reached zero. * Returns %true on success; %false on failure. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { return percpu_ref_tryget_many(ref, 1); } /** * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the * caller is responsible for taking RCU. * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; bool ret = false; WARN_ON_ONCE(!rcu_read_lock_held()); if (likely(__ref_is_percpu(ref, &percpu_count))) { this_cpu_inc(*percpu_count); ret = true; } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { ret = atomic_long_inc_not_zero(&ref->data->count); } return ret; } /** * percpu_ref_tryget_live - try to increment a live percpu refcount * @ref: percpu_ref to try-get * * Increment a percpu refcount unless it has already been killed. Returns * %true on success; %false on failure. * * Completion of percpu_ref_kill() in itself doesn't guarantee that this * function will fail. For such guarantee, percpu_ref_kill_and_confirm() * should be used. After the confirm_kill callback is invoked, it's * guaranteed that no new reference will be given out by * percpu_ref_tryget_live(). * * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { bool ret = false; rcu_read_lock(); ret = percpu_ref_tryget_live_rcu(ref); rcu_read_unlock(); return ret; } /** * percpu_ref_put_many - decrement a percpu refcount * @ref: percpu_ref to put * @nr: number of references to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock(); if (__ref_is_percpu(ref, &percpu_count)) this_cpu_sub(*percpu_count, nr); else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count))) ref->data->release(ref); rcu_read_unlock(); } /** * percpu_ref_put - decrement a percpu refcount * @ref: percpu_ref to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put(struct percpu_ref *ref) { percpu_ref_put_many(ref, 1); } /** * percpu_ref_is_dying - test whether a percpu refcount is dying or dead * @ref: percpu_ref to test * * Returns %true if @ref is dying or dead. * * This function is safe to call as long as @ref is between init and exit * and the caller is responsible for synchronizing against state changes. */ static inline bool percpu_ref_is_dying(struct percpu_ref *ref) { return ref->percpu_count_ptr & __PERCPU_REF_DEAD; } #endif |
60 2 46 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 | // SPDX-License-Identifier: GPL-2.0 /* * file.c - part of debugfs, a tiny little debug file system * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Inc. * * debugfs is for people to use instead of /proc or /sys. * See Documentation/filesystems/ for more details. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> #include <linux/pm_runtime.h> #include <linux/poll.h> #include <linux/security.h> #include "internal.h" struct poll_table_struct; static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return 0; } static ssize_t default_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return count; } const struct file_operations debugfs_noop_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; #define F_DENTRY(filp) ((filp)->f_path.dentry) const struct file_operations *debugfs_real_fops(const struct file *filp) { struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata; if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT) { /* * Urgh, we've been called w/o a protecting * debugfs_file_get(). */ WARN_ON(1); return NULL; } return fsd->real_fops; } EXPORT_SYMBOL_GPL(debugfs_real_fops); enum dbgfs_get_mode { DBGFS_GET_ALREADY, DBGFS_GET_REGULAR, DBGFS_GET_SHORT, }; static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode) { struct debugfs_fsdata *fsd; void *d_fsd; /* * This could only happen if some debugfs user erroneously calls * debugfs_file_get() on a dentry that isn't even a file, let * them know about it. */ if (WARN_ON(!d_is_reg(dentry))) return -EINVAL; d_fsd = READ_ONCE(dentry->d_fsdata); if (!((unsigned long)d_fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)) { fsd = d_fsd; } else { if (WARN_ON(mode == DBGFS_GET_ALREADY)) return -EINVAL; fsd = kmalloc(sizeof(*fsd), GFP_KERNEL); if (!fsd) return -ENOMEM; if (mode == DBGFS_GET_SHORT) { fsd->real_fops = NULL; fsd->short_fops = (void *)((unsigned long)d_fsd & ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); } else { fsd->real_fops = (void *)((unsigned long)d_fsd & ~DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); fsd->short_fops = NULL; } refcount_set(&fsd->active_users, 1); init_completion(&fsd->active_users_drained); INIT_LIST_HEAD(&fsd->cancellations); mutex_init(&fsd->cancellations_mtx); if (cmpxchg(&dentry->d_fsdata, d_fsd, fsd) != d_fsd) { mutex_destroy(&fsd->cancellations_mtx); kfree(fsd); fsd = READ_ONCE(dentry->d_fsdata); } } /* * In case of a successful cmpxchg() above, this check is * strictly necessary and must follow it, see the comment in * __debugfs_remove_file(). * OTOH, if the cmpxchg() hasn't been executed or wasn't * successful, this serves the purpose of not starving * removers. */ if (d_unlinked(dentry)) return -EIO; if (!refcount_inc_not_zero(&fsd->active_users)) return -EIO; return 0; } /** * debugfs_file_get - mark the beginning of file data access * @dentry: the dentry object whose data is being accessed. * * Up to a matching call to debugfs_file_put(), any successive call * into the file removing functions debugfs_remove() and * debugfs_remove_recursive() will block. Since associated private * file data may only get freed after a successful return of any of * the removal functions, you may safely access it after a successful * call to debugfs_file_get() without worrying about lifetime issues. * * If -%EIO is returned, the file has already been removed and thus, * it is not safe to access any of its data. If, on the other hand, * it is allowed to access the file data, zero is returned. */ int debugfs_file_get(struct dentry *dentry) { return __debugfs_file_get(dentry, DBGFS_GET_ALREADY); } EXPORT_SYMBOL_GPL(debugfs_file_get); /** * debugfs_file_put - mark the end of file data access * @dentry: the dentry object formerly passed to * debugfs_file_get(). * * Allow any ongoing concurrent call into debugfs_remove() or * debugfs_remove_recursive() blocked by a former call to * debugfs_file_get() to proceed and return to its caller. */ void debugfs_file_put(struct dentry *dentry) { struct debugfs_fsdata *fsd = READ_ONCE(dentry->d_fsdata); if (refcount_dec_and_test(&fsd->active_users)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_file_put); /** * debugfs_enter_cancellation - enter a debugfs cancellation * @file: the file being accessed * @cancellation: the cancellation object, the cancel callback * inside of it must be initialized * * When a debugfs file is removed it needs to wait for all active * operations to complete. However, the operation itself may need * to wait for hardware or completion of some asynchronous process * or similar. As such, it may need to be cancelled to avoid long * waits or even deadlocks. * * This function can be used inside a debugfs handler that may * need to be cancelled. As soon as this function is called, the * cancellation's 'cancel' callback may be called, at which point * the caller should proceed to call debugfs_leave_cancellation() * and leave the debugfs handler function as soon as possible. * Note that the 'cancel' callback is only ever called in the * context of some kind of debugfs_remove(). * * This function must be paired with debugfs_leave_cancellation(). */ void debugfs_enter_cancellation(struct file *file, struct debugfs_cancellation *cancellation) { struct debugfs_fsdata *fsd; struct dentry *dentry = F_DENTRY(file); INIT_LIST_HEAD(&cancellation->list); if (WARN_ON(!d_is_reg(dentry))) return; if (WARN_ON(!cancellation->cancel)) return; fsd = READ_ONCE(dentry->d_fsdata); if (WARN_ON(!fsd || ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) return; mutex_lock(&fsd->cancellations_mtx); list_add(&cancellation->list, &fsd->cancellations); mutex_unlock(&fsd->cancellations_mtx); /* if we're already removing wake it up to cancel */ if (d_unlinked(dentry)) complete(&fsd->active_users_drained); } EXPORT_SYMBOL_GPL(debugfs_enter_cancellation); /** * debugfs_leave_cancellation - leave cancellation section * @file: the file being accessed * @cancellation: the cancellation previously registered with * debugfs_enter_cancellation() * * See the documentation of debugfs_enter_cancellation(). */ void debugfs_leave_cancellation(struct file *file, struct debugfs_cancellation *cancellation) { struct debugfs_fsdata *fsd; struct dentry *dentry = F_DENTRY(file); if (WARN_ON(!d_is_reg(dentry))) return; fsd = READ_ONCE(dentry->d_fsdata); if (WARN_ON(!fsd || ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT))) return; mutex_lock(&fsd->cancellations_mtx); if (!list_empty(&cancellation->list)) list_del(&cancellation->list); mutex_unlock(&fsd->cancellations_mtx); } EXPORT_SYMBOL_GPL(debugfs_leave_cancellation); /* * Only permit access to world-readable files when the kernel is locked down. * We also need to exclude any file that has ways to write or alter it as root * can bypass the permissions check. */ static int debugfs_locked_down(struct inode *inode, struct file *filp, const struct file_operations *real_fops) { if ((inode->i_mode & 07777 & ~0444) == 0 && !(filp->f_mode & FMODE_WRITE) && (!real_fops || (!real_fops->unlocked_ioctl && !real_fops->compat_ioctl && !real_fops->mmap))) return 0; if (security_locked_down(LOCKDOWN_DEBUGFS)) return -EPERM; return 0; } static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = NULL; int r; r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR); if (r) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not clean up after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } replace_fops(filp, real_fops); if (real_fops->open) r = real_fops->open(inode, filp); out: debugfs_file_put(dentry); return r; } const struct file_operations debugfs_open_proxy_file_operations = { .open = open_proxy_open, }; #define PROTO(args...) args #define ARGS(args...) args #define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ static ret_type full_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ const struct file_operations *real_fops; \ ret_type r; \ \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ real_fops = debugfs_real_fops(filp); \ r = real_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } #define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args) \ static ret_type full_proxy_ ## name(proto) \ { \ struct dentry *dentry = F_DENTRY(filp); \ struct debugfs_fsdata *fsd; \ ret_type r; \ \ r = debugfs_file_get(dentry); \ if (unlikely(r)) \ return r; \ fsd = dentry->d_fsdata; \ if (fsd->real_fops) \ r = fsd->real_fops->name(args); \ else \ r = fsd->short_fops->name(args); \ debugfs_file_put(dentry); \ return r; \ } FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp, PROTO(struct file *filp, loff_t offset, int whence), ARGS(filp, offset, whence)); FULL_PROXY_FUNC_BOTH(read, ssize_t, filp, PROTO(struct file *filp, char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC_BOTH(write, ssize_t, filp, PROTO(struct file *filp, const char __user *buf, size_t size, loff_t *ppos), ARGS(filp, buf, size, ppos)); FULL_PROXY_FUNC(unlocked_ioctl, long, filp, PROTO(struct file *filp, unsigned int cmd, unsigned long arg), ARGS(filp, cmd, arg)); static __poll_t full_proxy_poll(struct file *filp, struct poll_table_struct *wait) { struct dentry *dentry = F_DENTRY(filp); __poll_t r = 0; const struct file_operations *real_fops; if (debugfs_file_get(dentry)) return EPOLLHUP; real_fops = debugfs_real_fops(filp); r = real_fops->poll(filp, wait); debugfs_file_put(dentry); return r; } static int full_proxy_release(struct inode *inode, struct file *filp) { const struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops = debugfs_real_fops(filp); const struct file_operations *proxy_fops = filp->f_op; int r = 0; /* * We must not protect this against removal races here: the * original releaser should be called unconditionally in order * not to leak any resources. Releasers must not assume that * ->i_private is still being meaningful here. */ if (real_fops && real_fops->release) r = real_fops->release(inode, filp); replace_fops(filp, d_inode(dentry)->i_fop); kfree(proxy_fops); fops_put(real_fops); return r; } static void __full_proxy_fops_init(struct file_operations *proxy_fops, struct debugfs_fsdata *fsd) { proxy_fops->release = full_proxy_release; if ((fsd->real_fops && fsd->real_fops->llseek) || (fsd->short_fops && fsd->short_fops->llseek)) proxy_fops->llseek = full_proxy_llseek; if ((fsd->real_fops && fsd->real_fops->read) || (fsd->short_fops && fsd->short_fops->read)) proxy_fops->read = full_proxy_read; if ((fsd->real_fops && fsd->real_fops->write) || (fsd->short_fops && fsd->short_fops->write)) proxy_fops->write = full_proxy_write; if (fsd->real_fops && fsd->real_fops->poll) proxy_fops->poll = full_proxy_poll; if (fsd->real_fops && fsd->real_fops->unlocked_ioctl) proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; } static int full_proxy_open(struct inode *inode, struct file *filp, enum dbgfs_get_mode mode) { struct dentry *dentry = F_DENTRY(filp); const struct file_operations *real_fops; struct file_operations *proxy_fops = NULL; struct debugfs_fsdata *fsd; int r; r = __debugfs_file_get(dentry, mode); if (r) return r == -EIO ? -ENOENT : r; fsd = dentry->d_fsdata; real_fops = fsd->real_fops; r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; if (real_fops && !fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && real_fops->owner->state == MODULE_STATE_GOING) { r = -ENXIO; goto out; } #endif /* Huh? Module did not cleanup after itself at exit? */ WARN(1, "debugfs file owner did not clean up at exit: %pd", dentry); r = -ENXIO; goto out; } proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); if (!proxy_fops) { r = -ENOMEM; goto free_proxy; } __full_proxy_fops_init(proxy_fops, fsd); replace_fops(filp, proxy_fops); if (!real_fops || real_fops->open) { if (real_fops) r = real_fops->open(inode, filp); else r = simple_open(inode, filp); if (r) { replace_fops(filp, d_inode(dentry)->i_fop); goto free_proxy; } else if (filp->f_op != proxy_fops) { /* No protection against file removal anymore. */ WARN(1, "debugfs file owner replaced proxy fops: %pd", dentry); goto free_proxy; } } goto out; free_proxy: kfree(proxy_fops); fops_put(real_fops); out: debugfs_file_put(dentry); return r; } static int full_proxy_open_regular(struct inode *inode, struct file *filp) { return full_proxy_open(inode, filp, DBGFS_GET_REGULAR); } const struct file_operations debugfs_full_proxy_file_operations = { .open = full_proxy_open_regular, }; static int full_proxy_open_short(struct inode *inode, struct file *filp) { return full_proxy_open(inode, filp, DBGFS_GET_SHORT); } const struct file_operations debugfs_full_short_proxy_file_operations = { .open = full_proxy_open_short, }; ssize_t debugfs_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; ret = simple_attr_read(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } EXPORT_SYMBOL_GPL(debugfs_attr_read); static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct dentry *dentry = F_DENTRY(file); ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; if (is_signed) ret = simple_attr_write_signed(file, buf, len, ppos); else ret = simple_attr_write(file, buf, len, ppos); debugfs_file_put(dentry); return ret; } ssize_t debugfs_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(debugfs_attr_write); ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return debugfs_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(debugfs_attr_write_signed); static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, struct dentry *parent, void *value, const struct file_operations *fops, const struct file_operations *fops_ro, const struct file_operations *fops_wo) { /* if there are no write bits set, make read only */ if (!(mode & S_IWUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_ro); /* if there are no read bits set, make write only */ if (!(mode & S_IRUGO)) return debugfs_create_file_unsafe(name, mode, parent, value, fops_wo); return debugfs_create_file_unsafe(name, mode, parent, value, fops); } static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; return 0; } static int debugfs_u8_get(void *data, u64 *val) { *val = *(u8 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); static int debugfs_u16_set(void *data, u64 val) { *(u16 *)data = val; return 0; } static int debugfs_u16_get(void *data, u64 *val) { *val = *(u16 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); static int debugfs_u32_set(void *data, u64 val) { *(u32 *)data = val; return 0; } static int debugfs_u32_get(void *data, u64 *val) { *val = *(u32 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); static int debugfs_u64_set(void *data, u64 val) { *(u64 *)data = val; return 0; } static int debugfs_u64_get(void *data, u64 *val) { *val = *(u64 *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); static int debugfs_ulong_set(void *data, u64 val) { *(unsigned long *)data = val; return 0; } static int debugfs_ulong_get(void *data, u64 *val) { *val = *(unsigned long *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); /** * debugfs_create_ulong - create a debugfs file that is used to read and write * an unsigned long value. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, unsigned long *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_ulong, &fops_ulong_ro, &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value * * These functions are exactly the same as the above functions (but use a hex * output for the decimal challenged). For details look at the above unsigned * decimal functions. */ /** * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); /** * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); /** * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); /** * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); static int debugfs_size_t_set(void *data, u64 val) { *(size_t *)data = val; return 0; } static int debugfs_size_t_get(void *data, u64 *val) { *val = *(size_t *)data; return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, "%llu\n"); /* %llu and %zu are more or less the same */ DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_size_t, &fops_size_t_ro, &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); static int debugfs_atomic_t_set(void *data, u64 val) { atomic_set((atomic_t *)data, val); return 0; } static int debugfs_atomic_t_get(void *data, u64 *val) { *val = atomic_read((atomic_t *)data); return 0; } DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); /** * debugfs_create_atomic_t - create a debugfs file that is used to read and * write an atomic_t value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. */ void debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_atomic_t, &fops_atomic_t_ro, &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[2]; bool val; int r; struct dentry *dentry = F_DENTRY(file); r = debugfs_file_get(dentry); if (unlikely(r)) return r; val = *(bool *)file->private_data; debugfs_file_put(dentry); if (val) buf[0] = 'Y'; else buf[0] = 'N'; buf[1] = '\n'; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } EXPORT_SYMBOL_GPL(debugfs_read_file_bool); ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { bool bv; int r; bool *val = file->private_data; struct dentry *dentry = F_DENTRY(file); r = kstrtobool_from_user(user_buf, count, &bv); if (!r) { r = debugfs_file_get(dentry); if (unlikely(r)) return r; *val = bv; debugfs_file_put(dentry); } return count; } EXPORT_SYMBOL_GPL(debugfs_write_file_bool); static const struct file_operations fops_bool = { .read = debugfs_read_file_bool, .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_ro = { .read = debugfs_read_file_bool, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_bool_wo = { .write = debugfs_write_file_bool, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_bool - create a debugfs file that is used to read and write a boolean value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *str, *copy = NULL; int copy_len, len; ssize_t ret; ret = debugfs_file_get(dentry); if (unlikely(ret)) return ret; str = *(char **)file->private_data; len = strlen(str) + 1; copy = kmalloc(len, GFP_KERNEL); if (!copy) { debugfs_file_put(dentry); return -ENOMEM; } copy_len = strscpy(copy, str, len); debugfs_file_put(dentry); if (copy_len < 0) { kfree(copy); return copy_len; } copy[copy_len] = '\n'; ret = simple_read_from_buffer(user_buf, count, ppos, copy, len); kfree(copy); return ret; } EXPORT_SYMBOL_GPL(debugfs_create_str); static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct dentry *dentry = F_DENTRY(file); char *old, *new = NULL; int pos = *ppos; int r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; old = *(char **)file->private_data; /* only allow strict concatenation */ r = -EINVAL; if (pos && pos != strlen(old)) goto error; r = -E2BIG; if (pos + count + 1 > PAGE_SIZE) goto error; r = -ENOMEM; new = kmalloc(pos + count + 1, GFP_KERNEL); if (!new) goto error; if (pos) memcpy(new, old, pos); r = -EFAULT; if (copy_from_user(new + pos, user_buf, count)) goto error; new[pos + count] = '\0'; strim(new); rcu_assign_pointer(*(char __rcu **)file->private_data, new); synchronize_rcu(); kfree(old); debugfs_file_put(dentry); return count; error: kfree(new); debugfs_file_put(dentry); return r; } static const struct file_operations fops_str = { .read = debugfs_read_file_str, .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_ro = { .read = debugfs_read_file_str, .open = simple_open, .llseek = default_llseek, }; static const struct file_operations fops_str_wo = { .write = debugfs_write_file_str, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_str - create a debugfs file that is used to read and write a string value * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @value: a pointer to the variable that the file should read to and write * from. * * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. */ void debugfs_create_str(const char *name, umode_t mode, struct dentry *parent, char **value) { debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str, &fops_str_ro, &fops_str_wo); } static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_read_from_buffer(user_buf, count, ppos, blob->data, blob->size); debugfs_file_put(dentry); return r; } static ssize_t write_file_blob(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; struct dentry *dentry = F_DENTRY(file); ssize_t r; r = debugfs_file_get(dentry); if (unlikely(r)) return r; r = simple_write_to_buffer(blob->data, blob->size, ppos, user_buf, count); debugfs_file_put(dentry); return r; } static const struct file_operations fops_blob = { .read = read_file_blob, .write = write_file_blob, .open = simple_open, .llseek = default_llseek, }; /** * debugfs_create_blob - create a debugfs file that is used to read and write * a binary blob * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @blob: a pointer to a struct debugfs_blob_wrapper which contains a pointer * to the blob data and the size of the data. * * This function creates a file in debugfs with the given name that exports * @blob->data as a binary blob. If the @mode variable is so set it can be * read from and written to. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { return debugfs_create_file_unsafe(name, mode & 0644, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); static size_t u32_format_array(char *buf, size_t bufsize, u32 *array, int array_size) { size_t ret = 0; while (--array_size >= 0) { size_t len; char term = array_size ? ' ' : '\n'; len = snprintf(buf, bufsize, "%u%c", *array++, term); ret += len; buf += len; bufsize -= len; } return ret; } static int u32_array_open(struct inode *inode, struct file *file) { struct debugfs_u32_array *data = inode->i_private; int size, elements = data->n_elements; char *buf; /* * Max size: * - 10 digits + ' '/'\n' = 11 bytes per number * - terminating NUL character */ size = elements*11; buf = kmalloc(size+1, GFP_KERNEL); if (!buf) return -ENOMEM; buf[size] = 0; file->private_data = buf; u32_format_array(buf, size, data->array, data->n_elements); return nonseekable_open(inode, file); } static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { size_t size = strlen(file->private_data); return simple_read_from_buffer(buf, len, ppos, file->private_data, size); } static int u32_array_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } static const struct file_operations u32_array_fops = { .owner = THIS_MODULE, .open = u32_array_open, .release = u32_array_release, .read = u32_array_read, }; /** * debugfs_create_u32_array - create a debugfs file that is used to read u32 * array. * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @array: wrapper struct containing data pointer and size of the array. * * This function creates a file in debugfs with the given name that exports * @array as data. If the @mode variable is so set it can be read from. * Writing is not supported. Seek within the file is also not supported. * Once array is created its size can not be changed. */ void debugfs_create_u32_array(const char *name, umode_t mode, struct dentry *parent, struct debugfs_u32_array *array) { debugfs_create_file_unsafe(name, mode, parent, array, &u32_array_fops); } EXPORT_SYMBOL_GPL(debugfs_create_u32_array); #ifdef CONFIG_HAS_IOMEM /* * The regset32 stuff is used to print 32-bit registers using the * seq_file utilities. We offer printing a register set in an already-opened * sequential file or create a debugfs file that only prints a regset32. */ /** * debugfs_print_regs32 - use seq_print to describe a set of registers * @s: the seq_file structure being used to generate output * @regs: an array if struct debugfs_reg32 structures * @nregs: the length of the above array * @base: the base address to be used in reading the registers * @prefix: a string to be prefixed to every output line * * This function outputs a text block describing the current values of * some 32-bit hardware registers. It is meant to be used within debugfs * files based on seq_file that need to show registers, intermixed with other * information. The prefix argument may be used to specify a leading string, * because some peripherals have several blocks of identical registers, * for example configuration of dma channels */ void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, int nregs, void __iomem *base, char *prefix) { int i; for (i = 0; i < nregs; i++, regs++) { if (prefix) seq_printf(s, "%s", prefix); seq_printf(s, "%s = 0x%08x\n", regs->name, readl(base + regs->offset)); if (seq_has_overflowed(s)) break; } } EXPORT_SYMBOL_GPL(debugfs_print_regs32); static int debugfs_regset32_show(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; if (regset->dev) pm_runtime_get_sync(regset->dev); debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); if (regset->dev) pm_runtime_put(regset->dev); return 0; } DEFINE_SHOW_ATTRIBUTE(debugfs_regset32); /** * debugfs_create_regset32 - create a debugfs file that returns register values * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @regset: a pointer to a struct debugfs_regset32, which contains a pointer * to an array of register definitions, the array size and the base * address where the register bank is to be found. * * This function creates a file in debugfs with the given name that reports * the names and values of a set of 32-bit registers. If the @mode variable * is so set it can be read from. Writing is not supported. */ void debugfs_create_regset32(const char *name, umode_t mode, struct dentry *parent, struct debugfs_regset32 *regset) { debugfs_create_file(name, mode, parent, regset, &debugfs_regset32_fops); } EXPORT_SYMBOL_GPL(debugfs_create_regset32); #endif /* CONFIG_HAS_IOMEM */ struct debugfs_devm_entry { int (*read)(struct seq_file *seq, void *data); struct device *dev; }; static int debugfs_devm_entry_open(struct inode *inode, struct file *f) { struct debugfs_devm_entry *entry = inode->i_private; return single_open(f, entry->read, entry->dev); } static const struct file_operations debugfs_devm_entry_ops = { .owner = THIS_MODULE, .open = debugfs_devm_entry_open, .release = single_release, .read = seq_read, .llseek = seq_lseek }; /** * debugfs_create_devm_seqfile - create a debugfs file that is bound to device. * * @dev: device related to this debugfs file. * @name: name of the debugfs file. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is %NULL, then the * file will be created in the root of the debugfs filesystem. * @read_fn: function pointer called to print the seq_file content. */ void debugfs_create_devm_seqfile(struct device *dev, const char *name, struct dentry *parent, int (*read_fn)(struct seq_file *s, void *data)) { struct debugfs_devm_entry *entry; if (IS_ERR(parent)) return; entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); if (!entry) return; entry->read = read_fn; entry->dev = dev; debugfs_create_file(name, S_IRUGO, parent, entry, &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); |
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 5 3 3 3 3 3 1 1 1 1 2 2 2 2 2 9 1 4 1 1 1 3 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/arp.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/init.h> #include <net/rose.h> #include <linux/seq_file.h> #include <linux/export.h> static unsigned int rose_neigh_no = 1; static struct rose_node *rose_node_list; static DEFINE_SPINLOCK(rose_node_list_lock); static struct rose_neigh *rose_neigh_list; static DEFINE_SPINLOCK(rose_neigh_list_lock); static struct rose_route *rose_route_list; static DEFINE_SPINLOCK(rose_route_list_lock); struct rose_neigh *rose_loopback_neigh; /* * Add a new route to a node, and in the process add the node and the * neighbour if it is new. */ static int __must_check rose_add_node(struct rose_route_struct *rose_route, struct net_device *dev) { struct rose_node *rose_node, *rose_tmpn, *rose_tmpp; struct rose_neigh *rose_neigh; int i, res = 0; spin_lock_bh(&rose_node_list_lock); spin_lock_bh(&rose_neigh_list_lock); rose_node = rose_node_list; while (rose_node != NULL) { if ((rose_node->mask == rose_route->mask) && (rosecmpm(&rose_route->address, &rose_node->address, rose_route->mask) == 0)) break; rose_node = rose_node->next; } if (rose_node != NULL && rose_node->loopback) { res = -EINVAL; goto out; } rose_neigh = rose_neigh_list; while (rose_neigh != NULL) { if (ax25cmp(&rose_route->neighbour, &rose_neigh->callsign) == 0 && rose_neigh->dev == dev) break; rose_neigh = rose_neigh->next; } if (rose_neigh == NULL) { rose_neigh = kmalloc(sizeof(*rose_neigh), GFP_ATOMIC); if (rose_neigh == NULL) { res = -ENOMEM; goto out; } rose_neigh->callsign = rose_route->neighbour; rose_neigh->digipeat = NULL; rose_neigh->ax25 = NULL; rose_neigh->dev = dev; rose_neigh->count = 0; rose_neigh->use = 0; rose_neigh->dce_mode = 0; rose_neigh->loopback = 0; rose_neigh->number = rose_neigh_no++; rose_neigh->restarted = 0; skb_queue_head_init(&rose_neigh->queue); timer_setup(&rose_neigh->ftimer, NULL, 0); timer_setup(&rose_neigh->t0timer, NULL, 0); if (rose_route->ndigis != 0) { rose_neigh->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC); if (rose_neigh->digipeat == NULL) { kfree(rose_neigh); res = -ENOMEM; goto out; } rose_neigh->digipeat->ndigi = rose_route->ndigis; rose_neigh->digipeat->lastrepeat = -1; for (i = 0; i < rose_route->ndigis; i++) { rose_neigh->digipeat->calls[i] = rose_route->digipeaters[i]; rose_neigh->digipeat->repeated[i] = 0; } } rose_neigh->next = rose_neigh_list; rose_neigh_list = rose_neigh; } /* * This is a new node to be inserted into the list. Find where it needs * to be inserted into the list, and insert it. We want to be sure * to order the list in descending order of mask size to ensure that * later when we are searching this list the first match will be the * best match. */ if (rose_node == NULL) { rose_tmpn = rose_node_list; rose_tmpp = NULL; while (rose_tmpn != NULL) { if (rose_tmpn->mask > rose_route->mask) { rose_tmpp = rose_tmpn; rose_tmpn = rose_tmpn->next; } else { break; } } /* create new node */ rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC); if (rose_node == NULL) { res = -ENOMEM; goto out; } rose_node->address = rose_route->address; rose_node->mask = rose_route->mask; rose_node->count = 1; rose_node->loopback = 0; rose_node->neighbour[0] = rose_neigh; if (rose_tmpn == NULL) { if (rose_tmpp == NULL) { /* Empty list */ rose_node_list = rose_node; rose_node->next = NULL; } else { rose_tmpp->next = rose_node; rose_node->next = NULL; } } else { if (rose_tmpp == NULL) { /* 1st node */ rose_node->next = rose_node_list; rose_node_list = rose_node; } else { rose_tmpp->next = rose_node; rose_node->next = rose_tmpn; } } rose_neigh->count++; goto out; } /* We have space, slot it in */ if (rose_node->count < 3) { rose_node->neighbour[rose_node->count] = rose_neigh; rose_node->count++; rose_neigh->count++; } out: spin_unlock_bh(&rose_neigh_list_lock); spin_unlock_bh(&rose_node_list_lock); return res; } /* * Caller is holding rose_node_list_lock. */ static void rose_remove_node(struct rose_node *rose_node) { struct rose_node *s; if ((s = rose_node_list) == rose_node) { rose_node_list = rose_node->next; kfree(rose_node); return; } while (s != NULL && s->next != NULL) { if (s->next == rose_node) { s->next = rose_node->next; kfree(rose_node); return; } s = s->next; } } /* * Caller is holding rose_neigh_list_lock. */ static void rose_remove_neigh(struct rose_neigh *rose_neigh) { struct rose_neigh *s; del_timer_sync(&rose_neigh->ftimer); del_timer_sync(&rose_neigh->t0timer); skb_queue_purge(&rose_neigh->queue); if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; if (rose_neigh->ax25) ax25_cb_put(rose_neigh->ax25); kfree(rose_neigh->digipeat); kfree(rose_neigh); return; } while (s != NULL && s->next != NULL) { if (s->next == rose_neigh) { s->next = rose_neigh->next; if (rose_neigh->ax25) ax25_cb_put(rose_neigh->ax25); kfree(rose_neigh->digipeat); kfree(rose_neigh); return; } s = s->next; } } /* * Caller is holding rose_route_list_lock. */ static void rose_remove_route(struct rose_route *rose_route) { struct rose_route *s; if (rose_route->neigh1 != NULL) rose_route->neigh1->use--; if (rose_route->neigh2 != NULL) rose_route->neigh2->use--; if ((s = rose_route_list) == rose_route) { rose_route_list = rose_route->next; kfree(rose_route); return; } while (s != NULL && s->next != NULL) { if (s->next == rose_route) { s->next = rose_route->next; kfree(rose_route); return; } s = s->next; } } /* * "Delete" a node. Strictly speaking remove a route to a node. The node * is only deleted if no routes are left to it. */ static int rose_del_node(struct rose_route_struct *rose_route, struct net_device *dev) { struct rose_node *rose_node; struct rose_neigh *rose_neigh; int i, err = 0; spin_lock_bh(&rose_node_list_lock); spin_lock_bh(&rose_neigh_list_lock); rose_node = rose_node_list; while (rose_node != NULL) { if ((rose_node->mask == rose_route->mask) && (rosecmpm(&rose_route->address, &rose_node->address, rose_route->mask) == 0)) break; rose_node = rose_node->next; } if (rose_node == NULL || rose_node->loopback) { err = -EINVAL; goto out; } rose_neigh = rose_neigh_list; while (rose_neigh != NULL) { if (ax25cmp(&rose_route->neighbour, &rose_neigh->callsign) == 0 && rose_neigh->dev == dev) break; rose_neigh = rose_neigh->next; } if (rose_neigh == NULL) { err = -EINVAL; goto out; } for (i = 0; i < rose_node->count; i++) { if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; if (rose_neigh->count == 0 && rose_neigh->use == 0) rose_remove_neigh(rose_neigh); rose_node->count--; if (rose_node->count == 0) { rose_remove_node(rose_node); } else { switch (i) { case 0: rose_node->neighbour[0] = rose_node->neighbour[1]; fallthrough; case 1: rose_node->neighbour[1] = rose_node->neighbour[2]; break; case 2: break; } } goto out; } } err = -EINVAL; out: spin_unlock_bh(&rose_neigh_list_lock); spin_unlock_bh(&rose_node_list_lock); return err; } /* * Add the loopback neighbour. */ void rose_add_loopback_neigh(void) { struct rose_neigh *sn; rose_loopback_neigh = kmalloc(sizeof(struct rose_neigh), GFP_KERNEL); if (!rose_loopback_neigh) return; sn = rose_loopback_neigh; sn->callsign = null_ax25_address; sn->digipeat = NULL; sn->ax25 = NULL; sn->dev = NULL; sn->count = 0; sn->use = 0; sn->dce_mode = 1; sn->loopback = 1; sn->number = rose_neigh_no++; sn->restarted = 1; skb_queue_head_init(&sn->queue); timer_setup(&sn->ftimer, NULL, 0); timer_setup(&sn->t0timer, NULL, 0); spin_lock_bh(&rose_neigh_list_lock); sn->next = rose_neigh_list; rose_neigh_list = sn; spin_unlock_bh(&rose_neigh_list_lock); } /* * Add a loopback node. */ int rose_add_loopback_node(const rose_address *address) { struct rose_node *rose_node; int err = 0; spin_lock_bh(&rose_node_list_lock); rose_node = rose_node_list; while (rose_node != NULL) { if ((rose_node->mask == 10) && (rosecmpm(address, &rose_node->address, 10) == 0) && rose_node->loopback) break; rose_node = rose_node->next; } if (rose_node != NULL) goto out; if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) { err = -ENOMEM; goto out; } rose_node->address = *address; rose_node->mask = 10; rose_node->count = 1; rose_node->loopback = 1; rose_node->neighbour[0] = rose_loopback_neigh; /* Insert at the head of list. Address is always mask=10 */ rose_node->next = rose_node_list; rose_node_list = rose_node; rose_loopback_neigh->count++; out: spin_unlock_bh(&rose_node_list_lock); return err; } /* * Delete a loopback node. */ void rose_del_loopback_node(const rose_address *address) { struct rose_node *rose_node; spin_lock_bh(&rose_node_list_lock); rose_node = rose_node_list; while (rose_node != NULL) { if ((rose_node->mask == 10) && (rosecmpm(address, &rose_node->address, 10) == 0) && rose_node->loopback) break; rose_node = rose_node->next; } if (rose_node == NULL) goto out; rose_remove_node(rose_node); rose_loopback_neigh->count--; out: spin_unlock_bh(&rose_node_list_lock); } /* * A device has been removed. Remove its routes and neighbours. */ void rose_rt_device_down(struct net_device *dev) { struct rose_neigh *s, *rose_neigh; struct rose_node *t, *rose_node; int i; spin_lock_bh(&rose_node_list_lock); spin_lock_bh(&rose_neigh_list_lock); rose_neigh = rose_neigh_list; while (rose_neigh != NULL) { s = rose_neigh; rose_neigh = rose_neigh->next; if (s->dev != dev) continue; rose_node = rose_node_list; while (rose_node != NULL) { t = rose_node; rose_node = rose_node->next; for (i = 0; i < t->count; i++) { if (t->neighbour[i] != s) continue; t->count--; switch (i) { case 0: t->neighbour[0] = t->neighbour[1]; fallthrough; case 1: t->neighbour[1] = t->neighbour[2]; break; case 2: break; } } if (t->count <= 0) rose_remove_node(t); } rose_remove_neigh(s); } spin_unlock_bh(&rose_neigh_list_lock); spin_unlock_bh(&rose_node_list_lock); } #if 0 /* Currently unused */ /* * A device has been removed. Remove its links. */ void rose_route_device_down(struct net_device *dev) { struct rose_route *s, *rose_route; spin_lock_bh(&rose_route_list_lock); rose_route = rose_route_list; while (rose_route != NULL) { s = rose_route; rose_route = rose_route->next; if (s->neigh1->dev == dev || s->neigh2->dev == dev) rose_remove_route(s); } spin_unlock_bh(&rose_route_list_lock); } #endif /* * Clear all nodes and neighbours out, except for neighbours with * active connections going through them. * Do not clear loopback neighbour and nodes. */ static int rose_clear_routes(void) { struct rose_neigh *s, *rose_neigh; struct rose_node *t, *rose_node; spin_lock_bh(&rose_node_list_lock); spin_lock_bh(&rose_neigh_list_lock); rose_neigh = rose_neigh_list; rose_node = rose_node_list; while (rose_node != NULL) { t = rose_node; rose_node = rose_node->next; if (!t->loopback) rose_remove_node(t); } while (rose_neigh != NULL) { s = rose_neigh; rose_neigh = rose_neigh->next; if (s->use == 0 && !s->loopback) { s->count = 0; rose_remove_neigh(s); } } spin_unlock_bh(&rose_neigh_list_lock); spin_unlock_bh(&rose_node_list_lock); return 0; } /* * Check that the device given is a valid AX.25 interface that is "up". * called with RTNL */ static struct net_device *rose_ax25_dev_find(char *devname) { struct net_device *dev; if ((dev = __dev_get_by_name(&init_net, devname)) == NULL) return NULL; if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) return dev; return NULL; } /* * Find the first active ROSE device, usually "rose0". */ struct net_device *rose_dev_first(void) { struct net_device *dev, *first = NULL; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } if (first) dev_hold(first); rcu_read_unlock(); return first; } /* * Find the ROSE device for the given address. */ struct net_device *rose_dev_get(rose_address *addr) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (const rose_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } } dev = NULL; out: rcu_read_unlock(); return dev; } static int rose_dev_exists(rose_address *addr) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (const rose_address *)dev->dev_addr) == 0) goto out; } dev = NULL; out: rcu_read_unlock(); return dev != NULL; } struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neigh) { struct rose_route *rose_route; for (rose_route = rose_route_list; rose_route != NULL; rose_route = rose_route->next) if ((rose_route->neigh1 == neigh && rose_route->lci1 == lci) || (rose_route->neigh2 == neigh && rose_route->lci2 == lci)) return rose_route; return NULL; } /* * Find a neighbour or a route given a ROSE address. */ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, unsigned char *diagnostic, int route_frame) { struct rose_neigh *res = NULL; struct rose_node *node; int failed = 0; int i; if (!route_frame) spin_lock_bh(&rose_node_list_lock); for (node = rose_node_list; node != NULL; node = node->next) { if (rosecmpm(addr, &node->address, node->mask) == 0) { for (i = 0; i < node->count; i++) { if (node->neighbour[i]->restarted) { res = node->neighbour[i]; goto out; } } } } if (!route_frame) { /* connect request */ for (node = rose_node_list; node != NULL; node = node->next) { if (rosecmpm(addr, &node->address, node->mask) == 0) { for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; goto out; } failed = 1; } } } } if (failed) { *cause = ROSE_OUT_OF_ORDER; *diagnostic = 0; } else { *cause = ROSE_NOT_OBTAINABLE; *diagnostic = 0; } out: if (!route_frame) spin_unlock_bh(&rose_node_list_lock); return res; } /* * Handle the ioctls that control the routing functions. */ int rose_rt_ioctl(unsigned int cmd, void __user *arg) { struct rose_route_struct rose_route; struct net_device *dev; int err; switch (cmd) { case SIOCADDRT: if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct))) return -EFAULT; if ((dev = rose_ax25_dev_find(rose_route.device)) == NULL) return -EINVAL; if (rose_dev_exists(&rose_route.address)) /* Can't add routes to ourself */ return -EINVAL; if (rose_route.mask > 10) /* Mask can't be more than 10 digits */ return -EINVAL; if (rose_route.ndigis > AX25_MAX_DIGIS) return -EINVAL; err = rose_add_node(&rose_route, dev); return err; case SIOCDELRT: if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct))) return -EFAULT; if ((dev = rose_ax25_dev_find(rose_route.device)) == NULL) return -EINVAL; err = rose_del_node(&rose_route, dev); return err; case SIOCRSCLRRT: return rose_clear_routes(); default: return -EINVAL; } return 0; } static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh) { struct rose_route *rose_route, *s; rose_neigh->restarted = 0; rose_stop_t0timer(rose_neigh); rose_start_ftimer(rose_neigh); skb_queue_purge(&rose_neigh->queue); spin_lock_bh(&rose_route_list_lock); rose_route = rose_route_list; while (rose_route != NULL) { if ((rose_route->neigh1 == rose_neigh && rose_route->neigh2 == rose_neigh) || (rose_route->neigh1 == rose_neigh && rose_route->neigh2 == NULL) || (rose_route->neigh2 == rose_neigh && rose_route->neigh1 == NULL)) { s = rose_route->next; rose_remove_route(rose_route); rose_route = s; continue; } if (rose_route->neigh1 == rose_neigh) { rose_route->neigh1->use--; rose_route->neigh1 = NULL; rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0); } if (rose_route->neigh2 == rose_neigh) { rose_route->neigh2->use--; rose_route->neigh2 = NULL; rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0); } rose_route = rose_route->next; } spin_unlock_bh(&rose_route_list_lock); } /* * A level 2 link has timed out, therefore it appears to be a poor link, * then don't use that neighbour until it is reset. Blow away all through * routes and connections using this route. */ void rose_link_failed(ax25_cb *ax25, int reason) { struct rose_neigh *rose_neigh; spin_lock_bh(&rose_neigh_list_lock); rose_neigh = rose_neigh_list; while (rose_neigh != NULL) { if (rose_neigh->ax25 == ax25) break; rose_neigh = rose_neigh->next; } if (rose_neigh != NULL) { rose_neigh->ax25 = NULL; ax25_cb_put(ax25); rose_del_route_by_neigh(rose_neigh); rose_kill_by_neigh(rose_neigh); } spin_unlock_bh(&rose_neigh_list_lock); } /* * A device has been "downed" remove its link status. Blow away all * through routes and connections that use this device. */ void rose_link_device_down(struct net_device *dev) { struct rose_neigh *rose_neigh; for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) { if (rose_neigh->dev == dev) { rose_del_route_by_neigh(rose_neigh); rose_kill_by_neigh(rose_neigh); } } } /* * Route a frame to an appropriate AX.25 connection. * A NULL ax25_cb indicates an internally generated frame. */ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) { struct rose_neigh *rose_neigh, *new_neigh; struct rose_route *rose_route; struct rose_facilities_struct facilities; rose_address *src_addr, *dest_addr; struct sock *sk; unsigned short frametype; unsigned int lci, new_lci; unsigned char cause, diagnostic; struct net_device *dev; int res = 0; char buf[11]; if (skb->len < ROSE_MIN_LEN) return res; if (!ax25) return rose_loopback_queue(skb, NULL); frametype = skb->data[2]; lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); if (frametype == ROSE_CALL_REQUEST && (skb->len <= ROSE_CALL_REQ_FACILITIES_OFF || skb->data[ROSE_CALL_REQ_ADDR_LEN_OFF] != ROSE_CALL_REQ_ADDR_LEN_VAL)) return res; src_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_SRC_ADDR_OFF); dest_addr = (rose_address *)(skb->data + ROSE_CALL_REQ_DEST_ADDR_OFF); spin_lock_bh(&rose_neigh_list_lock); spin_lock_bh(&rose_route_list_lock); rose_neigh = rose_neigh_list; while (rose_neigh != NULL) { if (ax25cmp(&ax25->dest_addr, &rose_neigh->callsign) == 0 && ax25->ax25_dev->dev == rose_neigh->dev) break; rose_neigh = rose_neigh->next; } if (rose_neigh == NULL) { printk("rose_route : unknown neighbour or device %s\n", ax2asc(buf, &ax25->dest_addr)); goto out; } /* * Obviously the link is working, halt the ftimer. */ rose_stop_ftimer(rose_neigh); /* * LCI of zero is always for us, and its always a restart * frame. */ if (lci == 0) { rose_link_rx_restart(skb, rose_neigh, frametype); goto out; } /* * Find an existing socket. */ if ((sk = rose_find_socket(lci, rose_neigh)) != NULL) { if (frametype == ROSE_CALL_REQUEST) { struct rose_sock *rose = rose_sk(sk); /* Remove an existing unused socket */ rose_clear_queues(sk); rose->cause = ROSE_NETWORK_CONGESTION; rose->diagnostic = 0; rose->neighbour->use--; rose->neighbour = NULL; rose->lci = 0; rose->state = ROSE_STATE_0; sk->sk_state = TCP_CLOSE; sk->sk_err = 0; sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } } else { skb_reset_transport_header(skb); res = rose_process_rx_frame(sk, skb); goto out; } } /* * Is is a Call Request and is it for us ? */ if (frametype == ROSE_CALL_REQUEST) if ((dev = rose_dev_get(dest_addr)) != NULL) { res = rose_rx_call_request(skb, dev, rose_neigh, lci); dev_put(dev); goto out; } if (!sysctl_rose_routing_control) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 0); goto out; } /* * Route it to the next in line if we have an entry for it. */ rose_route = rose_route_list; while (rose_route != NULL) { if (rose_route->lci1 == lci && rose_route->neigh1 == rose_neigh) { if (frametype == ROSE_CALL_REQUEST) { /* F6FBB - Remove an existing unused route */ rose_remove_route(rose_route); break; } else if (rose_route->neigh2 != NULL) { skb->data[0] &= 0xF0; skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; rose_transmit_link(skb, rose_route->neigh2); if (frametype == ROSE_CLEAR_CONFIRMATION) rose_remove_route(rose_route); res = 1; goto out; } else { if (frametype == ROSE_CLEAR_CONFIRMATION) rose_remove_route(rose_route); goto out; } } if (rose_route->lci2 == lci && rose_route->neigh2 == rose_neigh) { if (frametype == ROSE_CALL_REQUEST) { /* F6FBB - Remove an existing unused route */ rose_remove_route(rose_route); break; } else if (rose_route->neigh1 != NULL) { skb->data[0] &= 0xF0; skb->data[0] |= (rose_route->lci1 >> 8) & 0x0F; skb->data[1] = (rose_route->lci1 >> 0) & 0xFF; rose_transmit_link(skb, rose_route->neigh1); if (frametype == ROSE_CLEAR_CONFIRMATION) rose_remove_route(rose_route); res = 1; goto out; } else { if (frametype == ROSE_CLEAR_CONFIRMATION) rose_remove_route(rose_route); goto out; } } rose_route = rose_route->next; } /* * We know that: * 1. The frame isn't for us, * 2. It isn't "owned" by any existing route. */ if (frametype != ROSE_CALL_REQUEST) { /* XXX */ res = 0; goto out; } memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, skb->len - ROSE_CALL_REQ_FACILITIES_OFF, &facilities)) { rose_transmit_clear_request(rose_neigh, lci, ROSE_INVALID_FACILITY, 76); goto out; } /* * Check for routing loops. */ rose_route = rose_route_list; while (rose_route != NULL) { if (rose_route->rand == facilities.rand && rosecmp(src_addr, &rose_route->src_addr) == 0 && ax25cmp(&facilities.dest_call, &rose_route->src_call) == 0 && ax25cmp(&facilities.source_call, &rose_route->dest_call) == 0) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 120); goto out; } rose_route = rose_route->next; } if ((new_neigh = rose_get_neigh(dest_addr, &cause, &diagnostic, 1)) == NULL) { rose_transmit_clear_request(rose_neigh, lci, cause, diagnostic); goto out; } if ((new_lci = rose_new_lci(new_neigh)) == 0) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71); goto out; } if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120); goto out; } rose_route->lci1 = lci; rose_route->src_addr = *src_addr; rose_route->dest_addr = *dest_addr; rose_route->src_call = facilities.dest_call; rose_route->dest_call = facilities.source_call; rose_route->rand = facilities.rand; rose_route->neigh1 = rose_neigh; rose_route->lci2 = new_lci; rose_route->neigh2 = new_neigh; rose_route->neigh1->use++; rose_route->neigh2->use++; rose_route->next = rose_route_list; rose_route_list = rose_route; skb->data[0] &= 0xF0; skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; rose_transmit_link(skb, rose_route->neigh2); res = 1; out: spin_unlock_bh(&rose_route_list_lock); spin_unlock_bh(&rose_neigh_list_lock); return res; } #ifdef CONFIG_PROC_FS static void *rose_node_start(struct seq_file *seq, loff_t *pos) __acquires(rose_node_list_lock) { struct rose_node *rose_node; int i = 1; spin_lock_bh(&rose_node_list_lock); if (*pos == 0) return SEQ_START_TOKEN; for (rose_node = rose_node_list; rose_node && i < *pos; rose_node = rose_node->next, ++i); return (i == *pos) ? rose_node : NULL; } static void *rose_node_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return (v == SEQ_START_TOKEN) ? rose_node_list : ((struct rose_node *)v)->next; } static void rose_node_stop(struct seq_file *seq, void *v) __releases(rose_node_list_lock) { spin_unlock_bh(&rose_node_list_lock); } static int rose_node_show(struct seq_file *seq, void *v) { char rsbuf[11]; int i; if (v == SEQ_START_TOKEN) seq_puts(seq, "address mask n neigh neigh neigh\n"); else { const struct rose_node *rose_node = v; seq_printf(seq, "%-10s %04d %d", rose2asc(rsbuf, &rose_node->address), rose_node->mask, rose_node->count); for (i = 0; i < rose_node->count; i++) seq_printf(seq, " %05d", rose_node->neighbour[i]->number); seq_puts(seq, "\n"); } return 0; } const struct seq_operations rose_node_seqops = { .start = rose_node_start, .next = rose_node_next, .stop = rose_node_stop, .show = rose_node_show, }; static void *rose_neigh_start(struct seq_file *seq, loff_t *pos) __acquires(rose_neigh_list_lock) { struct rose_neigh *rose_neigh; int i = 1; spin_lock_bh(&rose_neigh_list_lock); if (*pos == 0) return SEQ_START_TOKEN; for (rose_neigh = rose_neigh_list; rose_neigh && i < *pos; rose_neigh = rose_neigh->next, ++i); return (i == *pos) ? rose_neigh : NULL; } static void *rose_neigh_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return (v == SEQ_START_TOKEN) ? rose_neigh_list : ((struct rose_neigh *)v)->next; } static void rose_neigh_stop(struct seq_file *seq, void *v) __releases(rose_neigh_list_lock) { spin_unlock_bh(&rose_neigh_list_lock); } static int rose_neigh_show(struct seq_file *seq, void *v) { char buf[11]; int i; if (v == SEQ_START_TOKEN) seq_puts(seq, "addr callsign dev count use mode restart t0 tf digipeaters\n"); else { struct rose_neigh *rose_neigh = v; /* if (!rose_neigh->loopback) { */ seq_printf(seq, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu", rose_neigh->number, (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, rose_neigh->use, (rose_neigh->dce_mode) ? "DCE" : "DTE", (rose_neigh->restarted) ? "yes" : "no", ax25_display_timer(&rose_neigh->t0timer) / HZ, ax25_display_timer(&rose_neigh->ftimer) / HZ); if (rose_neigh->digipeat != NULL) { for (i = 0; i < rose_neigh->digipeat->ndigi; i++) seq_printf(seq, " %s", ax2asc(buf, &rose_neigh->digipeat->calls[i])); } seq_puts(seq, "\n"); } return 0; } const struct seq_operations rose_neigh_seqops = { .start = rose_neigh_start, .next = rose_neigh_next, .stop = rose_neigh_stop, .show = rose_neigh_show, }; static void *rose_route_start(struct seq_file *seq, loff_t *pos) __acquires(rose_route_list_lock) { struct rose_route *rose_route; int i = 1; spin_lock_bh(&rose_route_list_lock); if (*pos == 0) return SEQ_START_TOKEN; for (rose_route = rose_route_list; rose_route && i < *pos; rose_route = rose_route->next, ++i); return (i == *pos) ? rose_route : NULL; } static void *rose_route_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return (v == SEQ_START_TOKEN) ? rose_route_list : ((struct rose_route *)v)->next; } static void rose_route_stop(struct seq_file *seq, void *v) __releases(rose_route_list_lock) { spin_unlock_bh(&rose_route_list_lock); } static int rose_route_show(struct seq_file *seq, void *v) { char buf[11], rsbuf[11]; if (v == SEQ_START_TOKEN) seq_puts(seq, "lci address callsign neigh <-> lci address callsign neigh\n"); else { struct rose_route *rose_route = v; if (rose_route->neigh1) seq_printf(seq, "%3.3X %-10s %-9s %05d ", rose_route->lci1, rose2asc(rsbuf, &rose_route->src_addr), ax2asc(buf, &rose_route->src_call), rose_route->neigh1->number); else seq_puts(seq, "000 * * 00000 "); if (rose_route->neigh2) seq_printf(seq, "%3.3X %-10s %-9s %05d\n", rose_route->lci2, rose2asc(rsbuf, &rose_route->dest_addr), ax2asc(buf, &rose_route->dest_call), rose_route->neigh2->number); else seq_puts(seq, "000 * * 00000\n"); } return 0; } struct seq_operations rose_route_seqops = { .start = rose_route_start, .next = rose_route_next, .stop = rose_route_stop, .show = rose_route_show, }; #endif /* CONFIG_PROC_FS */ /* * Release all memory associated with ROSE routing structures. */ void __exit rose_rt_free(void) { struct rose_neigh *s, *rose_neigh = rose_neigh_list; struct rose_node *t, *rose_node = rose_node_list; struct rose_route *u, *rose_route = rose_route_list; while (rose_neigh != NULL) { s = rose_neigh; rose_neigh = rose_neigh->next; rose_remove_neigh(s); } while (rose_node != NULL) { t = rose_node; rose_node = rose_node->next; rose_remove_node(t); } while (rose_route != NULL) { u = rose_route; rose_route = rose_route->next; rose_remove_route(u); } } |
23 7 112 7 6 19 20 45 2 4 3 4 91 98 1 46 45 90 89 61 46 11 177 178 3 9 81 93 95 6 6 6 6 6 6 6 5 4 6 6 6 5 6 6 21 21 14 6 6 6 6 1 6 5 5 6 21 21 21 21 14 21 21 1 9 9 13 13 22 9 13 6 2 4 4 6 8 7 6 13 13 4 4 1 99 98 97 10 4 1 1 6 10 10 9 10 10 10 9 9 9 2 1 39 1 16 73 27 8 3 3 1 1 2 40 39 40 40 40 39 38 40 16 6 13 1 2 39 9 2 2 1 7 37 3 3 27 13 40 22 22 16 16 16 9 7 16 9 22 17 28 39 33 10 40 6 39 38 39 39 33 9 9 35 6 15 26 38 39 39 15 2 1 1 3 6 64 64 9 11 4 9 5 1 36 10 11 7 28 26 28 28 26 38 35 2 31 6 37 1 38 1 7 2 1 6 4 5 5 5 51 52 53 7 28 19 19 3 1 1 13 22 24 31 18 7 3 5 23 2 22 1 5 2 2 1 14 3 16 16 4 4 6 6 4 13 13 12 12 10 1 7 5 6 12 9 8 48 48 48 47 48 42 42 42 42 41 24 24 24 24 3 5 5 5 5 5 40 20 40 40 141 141 140 141 141 139 39 2 1 37 36 36 33 8 35 28 8 7 1 1 96 98 97 95 1 1 102 99 95 1 2 92 1 1 5 6 4 1 98 7 7 91 38 53 40 16 51 1 37 22 15 5 5 5 43 43 5 5 5 5 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 | // SPDX-License-Identifier: GPL-2.0-only /* * xfrm_state.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * YOSHIFUJI Hideaki @USAGI * Split up af-specific functions * Derek Atkins <derek@ihtfp.com> * Add UDP Encapsulation * */ #include <linux/compat.h> #include <linux/workqueue.h> #include <net/xfrm.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/audit.h> #include <linux/uaccess.h> #include <linux/ktime.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/kernel.h> #include <crypto/aead.h> #include "xfrm_hash.h" #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) static void xfrm_state_gc_task(struct work_struct *work); /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) 2. Hash table by (daddr,family,reqid) to find what SAs exist for given destination/tunnel endpoint. (output) */ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; static struct kmem_cache *xfrm_state_cache __ro_after_init; static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); static HLIST_HEAD(xfrm_state_gc_list); static HLIST_HEAD(xfrm_state_dev_gc_list); static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) { return refcount_inc_not_zero(&x->refcnt); } static inline unsigned int xfrm_dst_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family) { return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask); } static inline unsigned int xfrm_src_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask); } static inline unsigned int xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } static unsigned int xfrm_seq_hash(struct net *net, u32 seq) { return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } #define XFRM_STATE_INSERT(by, _n, _h, _type) \ { \ struct xfrm_state *_x = NULL; \ \ if (_type != XFRM_DEV_OFFLOAD_PACKET) { \ hlist_for_each_entry_rcu(_x, _h, by) { \ if (_x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ continue; \ break; \ } \ } \ \ if (!_x || _x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ /* SAD is empty or consist from HW SAs only */ \ hlist_add_head_rcu(_n, _h); \ else \ hlist_add_before_rcu(_n, &_x->by); \ } static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, struct hlist_head *nsrctable, struct hlist_head *nspitable, struct hlist_head *nseqtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_state *x; hlist_for_each_entry_safe(x, tmp, list, bydst) { unsigned int h; h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family, nhashmask); XFRM_STATE_INSERT(bydst, &x->bydst, ndsttable + h, x->xso.type); h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family, nhashmask); XFRM_STATE_INSERT(bysrc, &x->bysrc, nsrctable + h, x->xso.type); if (x->id.spi) { h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family, nhashmask); XFRM_STATE_INSERT(byspi, &x->byspi, nspitable + h, x->xso.type); } if (x->km.seq) { h = __xfrm_seq_hash(x->km.seq, nhashmask); XFRM_STATE_INSERT(byseq, &x->byseq, nseqtable + h, x->xso.type); } } } static unsigned long xfrm_hash_new_size(unsigned int state_hmask) { return ((state_hmask + 1) << 1) * sizeof(struct hlist_head); } static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.state_hash_work); struct hlist_head *ndst, *nsrc, *nspi, *nseq, *odst, *osrc, *ospi, *oseq; unsigned long nsize, osize; unsigned int nhashmask, ohashmask; int i; nsize = xfrm_hash_new_size(net->xfrm.state_hmask); ndst = xfrm_hash_alloc(nsize); if (!ndst) return; nsrc = xfrm_hash_alloc(nsize); if (!nsrc) { xfrm_hash_free(ndst, nsize); return; } nspi = xfrm_hash_alloc(nsize); if (!nspi) { xfrm_hash_free(ndst, nsize); xfrm_hash_free(nsrc, nsize); return; } nseq = xfrm_hash_alloc(nsize); if (!nseq) { xfrm_hash_free(ndst, nsize); xfrm_hash_free(nsrc, nsize); xfrm_hash_free(nspi, nsize); return; } spin_lock_bh(&net->xfrm.xfrm_state_lock); write_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net); for (i = net->xfrm.state_hmask; i >= 0; i--) xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nseq, nhashmask); osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net); ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net); oseq = xfrm_state_deref_prot(net->xfrm.state_byseq, net); ohashmask = net->xfrm.state_hmask; rcu_assign_pointer(net->xfrm.state_bydst, ndst); rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); rcu_assign_pointer(net->xfrm.state_byspi, nspi); rcu_assign_pointer(net->xfrm.state_byseq, nseq); net->xfrm.state_hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_state_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_state_lock); osize = (ohashmask + 1) * sizeof(struct hlist_head); synchronize_rcu(); xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); xfrm_hash_free(oseq, osize); } static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); static struct xfrm_state_afinfo __rcu *xfrm_state_afinfo[NPROTO]; static DEFINE_SPINLOCK(xfrm_state_gc_lock); static DEFINE_SPINLOCK(xfrm_state_dev_gc_lock); int __xfrm_state_delete(struct xfrm_state *x); int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); static bool km_is_alive(const struct km_event *c); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); int xfrm_register_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); int err = 0; if (!afinfo) return -EAFNOSUPPORT; #define X(afi, T, name) do { \ WARN_ON((afi)->type_ ## name); \ (afi)->type_ ## name = (T); \ } while (0) switch (type->proto) { case IPPROTO_COMP: X(afinfo, type, comp); break; case IPPROTO_AH: X(afinfo, type, ah); break; case IPPROTO_ESP: X(afinfo, type, esp); break; case IPPROTO_IPIP: X(afinfo, type, ipip); break; case IPPROTO_DSTOPTS: X(afinfo, type, dstopts); break; case IPPROTO_ROUTING: X(afinfo, type, routing); break; case IPPROTO_IPV6: X(afinfo, type, ipip6); break; default: WARN_ON(1); err = -EPROTONOSUPPORT; break; } #undef X rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type); void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return; #define X(afi, T, name) do { \ WARN_ON((afi)->type_ ## name != (T)); \ (afi)->type_ ## name = NULL; \ } while (0) switch (type->proto) { case IPPROTO_COMP: X(afinfo, type, comp); break; case IPPROTO_AH: X(afinfo, type, ah); break; case IPPROTO_ESP: X(afinfo, type, esp); break; case IPPROTO_IPIP: X(afinfo, type, ipip); break; case IPPROTO_DSTOPTS: X(afinfo, type, dstopts); break; case IPPROTO_ROUTING: X(afinfo, type, routing); break; case IPPROTO_IPV6: X(afinfo, type, ipip6); break; default: WARN_ON(1); break; } #undef X rcu_read_unlock(); } EXPORT_SYMBOL(xfrm_unregister_type); static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { const struct xfrm_type *type = NULL; struct xfrm_state_afinfo *afinfo; int modload_attempted = 0; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; switch (proto) { case IPPROTO_COMP: type = afinfo->type_comp; break; case IPPROTO_AH: type = afinfo->type_ah; break; case IPPROTO_ESP: type = afinfo->type_esp; break; case IPPROTO_IPIP: type = afinfo->type_ipip; break; case IPPROTO_DSTOPTS: type = afinfo->type_dstopts; break; case IPPROTO_ROUTING: type = afinfo->type_routing; break; case IPPROTO_IPV6: type = afinfo->type_ipip6; break; default: break; } if (unlikely(type && !try_module_get(type->owner))) type = NULL; rcu_read_unlock(); if (!type && !modload_attempted) { request_module("xfrm-type-%d-%d", family, proto); modload_attempted = 1; goto retry; } return type; } static void xfrm_put_type(const struct xfrm_type *type) { module_put(type->owner); } int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; switch (type->proto) { case IPPROTO_ESP: WARN_ON(afinfo->type_offload_esp); afinfo->type_offload_esp = type; break; default: WARN_ON(1); err = -EPROTONOSUPPORT; break; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type_offload); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return; switch (type->proto) { case IPPROTO_ESP: WARN_ON(afinfo->type_offload_esp != type); afinfo->type_offload_esp = NULL; break; default: WARN_ON(1); break; } rcu_read_unlock(); } EXPORT_SYMBOL(xfrm_unregister_type_offload); static const struct xfrm_type_offload * xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load) { const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; switch (proto) { case IPPROTO_ESP: type = afinfo->type_offload_esp; break; default: break; } if ((type && !try_module_get(type->owner))) type = NULL; rcu_read_unlock(); if (!type && try_load) { request_module("xfrm-offload-%d-%d", family, proto); try_load = false; goto retry; } return type; } static void xfrm_put_type_offload(const struct xfrm_type_offload *type) { module_put(type->owner); } static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { [XFRM_MODE_BEET] = { .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, [XFRM_MODE_TRANSPORT] = { .encap = XFRM_MODE_TRANSPORT, .family = AF_INET, }, [XFRM_MODE_TUNNEL] = { .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, }; static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { [XFRM_MODE_BEET] = { .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, [XFRM_MODE_ROUTEOPTIMIZATION] = { .encap = XFRM_MODE_ROUTEOPTIMIZATION, .family = AF_INET6, }, [XFRM_MODE_TRANSPORT] = { .encap = XFRM_MODE_TRANSPORT, .family = AF_INET6, }, [XFRM_MODE_TUNNEL] = { .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, }; static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) { const struct xfrm_mode *mode; if (unlikely(encap >= XFRM_MODE_MAX)) return NULL; switch (family) { case AF_INET: mode = &xfrm4_mode_map[encap]; if (mode->family == family) return mode; break; case AF_INET6: mode = &xfrm6_mode_map[encap]; if (mode->family == family) return mode; break; default: break; } return NULL; } void xfrm_state_free(struct xfrm_state *x) { kmem_cache_free(xfrm_state_cache, x); } EXPORT_SYMBOL(xfrm_state_free); static void ___xfrm_state_destroy(struct xfrm_state *x) { hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); kfree(x->aead); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); kfree(x->encap); kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); if (x->type_offload) xfrm_put_type_offload(x->type_offload); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); } if (x->xfrag.page) put_page(x->xfrag.page); xfrm_dev_state_free(x); security_xfrm_state_free(x); xfrm_state_free(x); } static void xfrm_state_gc_task(struct work_struct *work) { struct xfrm_state *x; struct hlist_node *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_state_gc_lock); hlist_move_list(&xfrm_state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) ___xfrm_state_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) { struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer); enum hrtimer_restart ret = HRTIMER_NORESTART; time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int err = 0; spin_lock(&x->lock); xfrm_dev_state_update_stats(x); if (x->km.state == XFRM_STATE_DEAD) goto out; if (x->km.state == XFRM_STATE_EXPIRED) goto expired; if (x->lft.hard_add_expires_seconds) { time64_t tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { if (x->xflags & XFRM_SOFT_EXPIRE) { /* enter hard expire without soft expire first?! * setting a new date could trigger this. * workaround: fix x->curflt.add_time by below: */ x->curlft.add_time = now - x->saved_tmo - 1; tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; } else goto expired; } if (tmo < next) next = tmo; } if (x->lft.hard_use_expires_seconds) { time64_t tmo = x->lft.hard_use_expires_seconds + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (x->km.dying) goto resched; if (x->lft.soft_add_expires_seconds) { time64_t tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { warn = 1; x->xflags &= ~XFRM_SOFT_EXPIRE; } else if (tmo < next) { next = tmo; x->xflags |= XFRM_SOFT_EXPIRE; x->saved_tmo = tmo; } } if (x->lft.soft_use_expires_seconds) { time64_t tmo = x->lft.soft_use_expires_seconds + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) next = tmo; } x->km.dying = warn; if (warn) km_state_expired(x, 0, 0); resched: if (next != TIME64_MAX) { hrtimer_forward_now(&x->mtimer, ktime_set(next, 0)); ret = HRTIMER_RESTART; } goto out; expired: if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) x->km.state = XFRM_STATE_EXPIRED; err = __xfrm_state_delete(x); if (!err) km_state_expired(x, 1, 0); xfrm_audit_state_delete(x, err ? 0 : 1, true); out: spin_unlock(&x->lock); return ret; } static void xfrm_replay_timer_handler(struct timer_list *t); struct xfrm_state *xfrm_state_alloc(struct net *net) { struct xfrm_state *x; x = kmem_cache_zalloc(xfrm_state_cache, GFP_ATOMIC); if (x) { write_pnet(&x->xs_net, net); refcount_set(&x->refcnt, 1); atomic_set(&x->tunnel_users, 0); INIT_LIST_HEAD(&x->km.all); INIT_HLIST_NODE(&x->state_cache); INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); INIT_HLIST_NODE(&x->byseq); hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); x->mtimer.function = xfrm_timer_handler; timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); x->curlft.add_time = ktime_get_real_seconds(); x->lft.soft_byte_limit = XFRM_INF; x->lft.soft_packet_limit = XFRM_INF; x->lft.hard_byte_limit = XFRM_INF; x->lft.hard_packet_limit = XFRM_INF; x->replay_maxage = 0; x->replay_maxdiff = 0; x->pcpu_num = UINT_MAX; spin_lock_init(&x->lock); } return x; } EXPORT_SYMBOL(xfrm_state_alloc); #ifdef CONFIG_XFRM_OFFLOAD void xfrm_dev_state_delete(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); if (dev) { dev->xfrmdev_ops->xdo_dev_state_delete(x); spin_lock_bh(&xfrm_state_dev_gc_lock); hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list); spin_unlock_bh(&xfrm_state_dev_gc_lock); } } EXPORT_SYMBOL_GPL(xfrm_dev_state_delete); void xfrm_dev_state_free(struct xfrm_state *x) { struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); if (dev && dev->xfrmdev_ops) { spin_lock_bh(&xfrm_state_dev_gc_lock); if (!hlist_unhashed(&x->dev_gclist)) hlist_del(&x->dev_gclist); spin_unlock_bh(&xfrm_state_dev_gc_lock); if (dev->xfrmdev_ops->xdo_dev_state_free) dev->xfrmdev_ops->xdo_dev_state_free(x); WRITE_ONCE(xso->dev, NULL); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); } } #endif void __xfrm_state_destroy(struct xfrm_state *x, bool sync) { WARN_ON(x->km.state != XFRM_STATE_DEAD); if (sync) { synchronize_rcu(); ___xfrm_state_destroy(x); } else { spin_lock_bh(&xfrm_state_gc_lock); hlist_add_head(&x->gclist, &xfrm_state_gc_list); spin_unlock_bh(&xfrm_state_gc_lock); schedule_work(&xfrm_state_gc_work); } } EXPORT_SYMBOL(__xfrm_state_destroy); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); int err = -ESRCH; if (x->km.state != XFRM_STATE_DEAD) { x->km.state = XFRM_STATE_DEAD; spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); hlist_del_rcu(&x->bydst); hlist_del_rcu(&x->bysrc); if (x->km.seq) hlist_del_rcu(&x->byseq); if (!hlist_unhashed(&x->state_cache)) hlist_del_rcu(&x->state_cache); if (!hlist_unhashed(&x->state_cache_input)) hlist_del_rcu(&x->state_cache_input); if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; xfrm_nat_keepalive_state_updated(x); spin_unlock(&net->xfrm.xfrm_state_lock); if (x->encap_sk) sock_put(rcu_dereference_raw(x->encap_sk)); xfrm_dev_state_delete(x); /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. */ xfrm_state_put(x); err = 0; } return err; } EXPORT_SYMBOL(__xfrm_state_delete); int xfrm_state_delete(struct xfrm_state *x) { int err; spin_lock_bh(&x->lock); err = __xfrm_state_delete(x); spin_unlock_bh(&x->lock); return err; } EXPORT_SYMBOL(xfrm_state_delete); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { int i, err = 0; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { if (xfrm_id_proto_match(x->id.proto, proto) && (err = security_xfrm_state_delete(x)) != 0) { xfrm_audit_state_delete(x, 0, task_valid); return err; } } } return err; } static inline int xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { int i, err = 0; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; struct xfrm_dev_offload *xso; hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { xso = &x->xso; if (xso->dev == dev && (err = security_xfrm_state_delete(x)) != 0) { xfrm_audit_state_delete(x, 0, task_valid); return err; } } } return err; } #else static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { return 0; } static inline int xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { return 0; } #endif int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) { int i, err = 0, cnt = 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_flush_secctx_check(net, proto, task_valid); if (err) goto out; err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; restart: hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { if (!xfrm_state_kern(x) && xfrm_id_proto_match(x->id.proto, proto)) { xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); if (sync) xfrm_state_put_sync(x); else xfrm_state_put(x); if (!err) cnt++; spin_lock_bh(&net->xfrm.xfrm_state_lock); goto restart; } } } out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (cnt) err = 0; return err; } EXPORT_SYMBOL(xfrm_state_flush); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid) { struct xfrm_state *x; struct hlist_node *tmp; struct xfrm_dev_offload *xso; int i, err = 0, cnt = 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_dev_state_flush_secctx_check(net, dev, task_valid); if (err) goto out; err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { restart: hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { xso = &x->xso; if (!xfrm_state_kern(x) && xso->dev == dev) { xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_delete(x); xfrm_dev_state_free(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); xfrm_state_put(x); if (!err) cnt++; spin_lock_bh(&net->xfrm.xfrm_state_lock); goto restart; } } } if (cnt) err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); spin_lock_bh(&xfrm_state_dev_gc_lock); restart_gc: hlist_for_each_entry_safe(x, tmp, &xfrm_state_dev_gc_list, dev_gclist) { xso = &x->xso; if (xso->dev == dev) { spin_unlock_bh(&xfrm_state_dev_gc_lock); xfrm_dev_state_free(x); spin_lock_bh(&xfrm_state_dev_gc_lock); goto restart_gc; } } spin_unlock_bh(&xfrm_state_dev_gc_lock); xfrm_flush_gc(); return err; } EXPORT_SYMBOL(xfrm_dev_state_flush); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) { spin_lock_bh(&net->xfrm.xfrm_state_lock); si->sadcnt = net->xfrm.state_num; si->sadhcnt = net->xfrm.state_hmask + 1; si->sadhmcnt = xfrm_state_hashmax; spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_sad_getinfo); static void __xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; sel->daddr.a4 = fl4->daddr; sel->saddr.a4 = fl4->saddr; sel->dport = xfrm_flowi_dport(fl, &fl4->uli); sel->dport_mask = htons(0xffff); sel->sport = xfrm_flowi_sport(fl, &fl4->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET; sel->prefixlen_d = 32; sel->prefixlen_s = 32; sel->proto = fl4->flowi4_proto; sel->ifindex = fl4->flowi4_oif; } static void __xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; /* Initialize temporary selector matching only to current session. */ *(struct in6_addr *)&sel->daddr = fl6->daddr; *(struct in6_addr *)&sel->saddr = fl6->saddr; sel->dport = xfrm_flowi_dport(fl, &fl6->uli); sel->dport_mask = htons(0xffff); sel->sport = xfrm_flowi_sport(fl, &fl6->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET6; sel->prefixlen_d = 128; sel->prefixlen_s = 128; sel->proto = fl6->flowi6_proto; sel->ifindex = fl6->flowi6_oif; } static void xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { switch (family) { case AF_INET: __xfrm4_init_tempsel(&x->sel, fl); break; case AF_INET6: __xfrm6_init_tempsel(&x->sel, fl); break; } x->id = tmpl->id; switch (tmpl->encap_family) { case AF_INET: if (x->id.daddr.a4 == 0) x->id.daddr.a4 = daddr->a4; x->props.saddr = tmpl->saddr; if (x->props.saddr.a4 == 0) x->props.saddr.a4 = saddr->a4; break; case AF_INET6: if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); break; } x->props.mode = tmpl->mode; x->props.reqid = tmpl->reqid; x->props.family = tmpl->encap_family; } static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, struct xfrm_dev_offload *xdo) { unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); struct xfrm_state *x; hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { #ifdef CONFIG_XFRM_OFFLOAD if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. */ if (xdo->dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); struct xfrm_state *x; hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { struct hlist_head *state_cache_input; struct xfrm_state *x = NULL; int cpu = get_cpu(); state_cache_input = per_cpu_ptr(net->xfrm.state_cache_input, cpu); rcu_read_lock(); hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; goto out; } x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); if (x && x->km.state == XFRM_STATE_VALID) { spin_lock_bh(&net->xfrm.xfrm_state_lock); if (hlist_unhashed(&x->state_cache_input)) { hlist_add_head_rcu(&x->state_cache_input, state_cache_input); } else { hlist_del_rcu(&x->state_cache_input); hlist_add_head_rcu(&x->state_cache_input, state_cache_input); } spin_unlock_bh(&net->xfrm.xfrm_state_lock); } out: rcu_read_unlock(); put_cpu(); return x; } EXPORT_SYMBOL(xfrm_input_state_lookup); static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { unsigned int h = xfrm_src_hash(net, daddr, saddr, family); struct xfrm_state *x; hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } static inline struct xfrm_state * __xfrm_state_locate(struct xfrm_state *x, int use_spi, int family) { struct net *net = xs_net(x); u32 mark = x->mark.v & x->mark.m; if (use_spi) return __xfrm_state_lookup(net, mark, &x->id.daddr, x->id.spi, x->id.proto, family); else return __xfrm_state_lookup_byaddr(net, mark, &x->id.daddr, &x->props.saddr, x->id.proto, family); } static void xfrm_hash_grow_check(struct net *net, int have_hash_collision) { if (have_hash_collision && (net->xfrm.state_hmask + 1) < xfrm_state_hashmax && net->xfrm.state_num > net->xfrm.state_hmask) schedule_work(&net->xfrm.state_hash_work); } static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, const struct flowi *fl, unsigned short family, struct xfrm_state **best, int *acq_in_progress, int *error) { /* We need the cpu id just as a lookup key, * we don't require it to be stable. */ unsigned int pcpu_id = get_cpu(); put_cpu(); /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. * * Entering area of "sysdeps". * * 3. If state is not valid, selector is temporary, it selects * only session which triggered previous resolution. Key * manager will do something to install a state with proper * selector. */ if (x->km.state == XFRM_STATE_VALID) { if ((x->sel.family && (x->sel.family != family || !xfrm_selector_match(&x->sel, fl, family))) || !security_xfrm_state_pol_flow_match(x, pol, &fl->u.__fl_common)) return; if (x->pcpu_num != UINT_MAX && x->pcpu_num != pcpu_id) return; if (!*best || ((*best)->pcpu_num == UINT_MAX && x->pcpu_num == pcpu_id) || (*best)->km.dying > x->km.dying || ((*best)->km.dying == x->km.dying && (*best)->curlft.add_time < x->curlft.add_time)) *best = x; } else if (x->km.state == XFRM_STATE_ACQ) { if (!*best || x->pcpu_num == pcpu_id) *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { if ((!x->sel.family || (x->sel.family == family && xfrm_selector_match(&x->sel, fl, family))) && security_xfrm_state_pol_flow_match(x, pol, &fl->u.__fl_common)) *error = -ESRCH; } } struct xfrm_state * xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); unsigned int h, h_wildcard; struct xfrm_state *x, *x0, *to_put; int acquire_in_progress = 0; int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; unsigned int pcpu_id; bool cached = false; /* We need the cpu id just as a lookup key, * we don't require it to be stable. */ pcpu_id = get_cpu(); put_cpu(); to_put = NULL; sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, encap_family, &best, &acquire_in_progress, &error); } if (best) goto cached; hlist_for_each_entry_rcu(x, &pol->state_cache_list, state_cache) { if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } cached: cached = true; if (best) goto found; else if (error) best = NULL; else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. */ if (pol->xdo.dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } if (best || acquire_in_progress) goto found; h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. */ if (pol->xdo.dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } found: if (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || (best && (best->pcpu_num == pcpu_id))) x = best; if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && (x0 = __xfrm_state_lookup_all(net, mark, daddr, tmpl->id.spi, tmpl->id.proto, encap_family, &pol->xdo)) != NULL) { to_put = x0; error = -EEXIST; goto out; } c.net = net; /* If the KMs have no listeners (yet...), avoid allocating an SA * for each and every packet - garbage collection might not * handle the flood. */ if (!km_is_alive(&c)) { error = -ESRCH; goto out; } x = xfrm_state_alloc(net); if (x == NULL) { error = -ENOMEM; goto out; } /* Initialize temporary state matching only * to current session. */ xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); x->if_id = if_id; if ((pol->flags & XFRM_POLICY_CPU_ACQUIRE) && best) x->pcpu_num = pcpu_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; goto out; } #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { struct xfrm_dev_offload *xdo = &pol->xdo; struct xfrm_dev_offload *xso = &x->xso; xso->type = XFRM_DEV_OFFLOAD_PACKET; xso->dir = xdo->dir; xso->dev = xdo->dev; xso->real_dev = xdo->real_dev; xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ; netdev_hold(xso->dev, &xso->dev_tracker, GFP_ATOMIC); error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL); if (error) { xso->dir = 0; netdev_put(xso->dev, &xso->dev_tracker); xso->dev = NULL; xso->real_dev = NULL; xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; goto out; } } #endif if (km_query(x, tmpl, pol) == 0) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; x->dir = XFRM_SA_DIR_OUT; list_add(&x->km.all, &net->xfrm.state_all); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, x->xso.type); h = xfrm_src_hash(net, daddr, saddr, encap_family); XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); INIT_HLIST_NODE(&x->state_cache); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h, x->xso.type); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { #ifdef CONFIG_XFRM_OFFLOAD struct xfrm_dev_offload *xso = &x->xso; if (xso->type == XFRM_DEV_OFFLOAD_PACKET) { xfrm_dev_state_delete(x); xfrm_dev_state_free(x); } #endif x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; error = -ESRCH; } /* Use the already installed 'fallback' while the CPU-specific * SA acquire is handled*/ if (best) x = best; } out: if (x) { if (!xfrm_state_hold_rcu(x)) { *err = -EAGAIN; x = NULL; } } else { *err = acquire_in_progress ? -EAGAIN : error; } if (x && x->km.state == XFRM_STATE_VALID && !cached && (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || x->pcpu_num == pcpu_id)) { spin_lock_bh(&net->xfrm.xfrm_state_lock); if (hlist_unhashed(&x->state_cache)) hlist_add_head_rcu(&x->state_cache, &pol->state_cache_list); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); if (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)) { *err = -EAGAIN; if (x) { xfrm_state_put(x); x = NULL; } } return x; } struct xfrm_state * xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid) { unsigned int h; struct xfrm_state *rx = NULL, *x = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, family) && mode == x->props.mode && proto == x->id.proto && x->km.state == XFRM_STATE_VALID) { rx = x; break; } } if (rx) xfrm_state_hold(rx); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return rx; } EXPORT_SYMBOL(xfrm_stateonly_find); struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family) { struct xfrm_state *x; struct xfrm_state_walk *w; spin_lock_bh(&net->xfrm.xfrm_state_lock); list_for_each_entry(w, &net->xfrm.state_all, all) { x = container_of(w, struct xfrm_state, km); if (x->props.family != family || x->id.spi != spi) continue; xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } spin_unlock_bh(&net->xfrm.xfrm_state_lock); return NULL; } EXPORT_SYMBOL(xfrm_state_lookup_byspi); static void __xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); unsigned int h; list_add(&x->km.all, &net->xfrm.state_all); h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, x->xso.type); h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h, x->xso.type); } hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); if (x->replay_maxage) mod_timer(&x->rtimer, jiffies + x->replay_maxage); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); xfrm_nat_keepalive_state_updated(x); } /* net->xfrm.xfrm_state_lock is held */ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) { struct net *net = xs_net(xnew); unsigned short family = xnew->props.family; u32 reqid = xnew->props.reqid; struct xfrm_state *x; unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; u32 if_id = xnew->if_id; u32 cpu_id = xnew->pcpu_num; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && x->if_id == if_id && x->pcpu_num == cpu_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) x->genid++; } } void xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); spin_lock_bh(&net->xfrm.xfrm_state_lock); __xfrm_state_bump_genids(x); __xfrm_state_insert(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_state_insert); /* net->xfrm.xfrm_state_lock is held */ static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) { unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family); struct xfrm_state *x; u32 mark = m->v & m->m; hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.reqid != reqid || x->props.mode != mode || x->props.family != family || x->km.state != XFRM_STATE_ACQ || x->id.spi != 0 || x->id.proto != proto || (mark & x->mark.m) != x->mark.v || x->pcpu_num != pcpu_num || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; xfrm_state_hold(x); return x; } if (!create) return NULL; x = xfrm_state_alloc(net); if (likely(x)) { switch (family) { case AF_INET: x->sel.daddr.a4 = daddr->a4; x->sel.saddr.a4 = saddr->a4; x->sel.prefixlen_d = 32; x->sel.prefixlen_s = 32; x->props.saddr.a4 = saddr->a4; x->id.daddr.a4 = daddr->a4; break; case AF_INET6: x->sel.daddr.in6 = daddr->in6; x->sel.saddr.in6 = saddr->in6; x->sel.prefixlen_d = 128; x->sel.prefixlen_s = 128; x->props.saddr.in6 = saddr->in6; x->id.daddr.in6 = daddr->in6; break; } x->pcpu_num = pcpu_num; x->km.state = XFRM_STATE_ACQ; x->id.proto = proto; x->props.family = family; x->props.mode = mode; x->props.reqid = reqid; x->if_id = if_id; x->mark.v = m->v; x->mark.m = m->m; x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; xfrm_state_hold(x); hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); list_add(&x->km.all, &net->xfrm.state_all); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, x->xso.type); h = xfrm_src_hash(net, daddr, saddr, family); XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); } return x; } static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_add(struct xfrm_state *x) { struct net *net = xs_net(x); struct xfrm_state *x1, *to_put; int family; int err; u32 mark = x->mark.v & x->mark.m; int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); family = x->props.family; to_put = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); x1 = __xfrm_state_locate(x, use_spi, family); if (x1) { to_put = x1; x1 = NULL; err = -EEXIST; goto out; } if (use_spi && x->km.seq) { x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq, x->pcpu_num); if (x1 && ((x1->id.proto != x->id.proto) || !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) { to_put = x1; x1 = NULL; } } if (use_spi && !x1) x1 = __find_acq_core(net, &x->mark, family, x->props.mode, x->props.reqid, x->if_id, x->pcpu_num, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); __xfrm_state_insert(x); err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (x1) { xfrm_state_delete(x1); xfrm_state_put(x1); } if (to_put) xfrm_state_put(to_put); return err; } EXPORT_SYMBOL(xfrm_state_add); #ifdef CONFIG_XFRM_MIGRATE static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *security) { struct xfrm_user_sec_ctx *uctx; int size = sizeof(*uctx) + security->ctx_len; int err; uctx = kmalloc(size, GFP_KERNEL); if (!uctx) return -ENOMEM; uctx->exttype = XFRMA_SEC_CTX; uctx->len = size; uctx->ctx_doi = security->ctx_doi; uctx->ctx_alg = security->ctx_alg; uctx->ctx_len = security->ctx_len; memcpy(uctx + 1, security->ctx_str, security->ctx_len); err = security_xfrm_state_alloc(x, uctx); kfree(uctx); if (err) return err; return 0; } static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, struct xfrm_encap_tmpl *encap) { struct net *net = xs_net(orig); struct xfrm_state *x = xfrm_state_alloc(net); if (!x) goto out; memcpy(&x->id, &orig->id, sizeof(x->id)); memcpy(&x->sel, &orig->sel, sizeof(x->sel)); memcpy(&x->lft, &orig->lft, sizeof(x->lft)); x->props.mode = orig->props.mode; x->props.replay_window = orig->props.replay_window; x->props.reqid = orig->props.reqid; x->props.family = orig->props.family; x->props.saddr = orig->props.saddr; if (orig->aalg) { x->aalg = xfrm_algo_auth_clone(orig->aalg); if (!x->aalg) goto error; } x->props.aalgo = orig->props.aalgo; if (orig->aead) { x->aead = xfrm_algo_aead_clone(orig->aead); x->geniv = orig->geniv; if (!x->aead) goto error; } if (orig->ealg) { x->ealg = xfrm_algo_clone(orig->ealg); if (!x->ealg) goto error; } x->props.ealgo = orig->props.ealgo; if (orig->calg) { x->calg = xfrm_algo_clone(orig->calg); if (!x->calg) goto error; } x->props.calgo = orig->props.calgo; if (encap || orig->encap) { if (encap) x->encap = kmemdup(encap, sizeof(*x->encap), GFP_KERNEL); else x->encap = kmemdup(orig->encap, sizeof(*x->encap), GFP_KERNEL); if (!x->encap) goto error; } if (orig->security) if (clone_security(x, orig->security)) goto error; if (orig->coaddr) { x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr), GFP_KERNEL); if (!x->coaddr) goto error; } if (orig->replay_esn) { if (xfrm_replay_clone(x, orig)) goto error; } memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; x->pcpu_num = orig->pcpu_num; x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; memcpy(&x->curlft, &orig->curlft, sizeof(x->curlft)); x->km.state = orig->km.state; x->km.seq = orig->km.seq; x->replay = orig->replay; x->preplay = orig->preplay; x->mapping_maxage = orig->mapping_maxage; x->lastused = orig->lastused; x->new_mapping = 0; x->new_mapping_sport = 0; x->dir = orig->dir; return x; error: xfrm_state_put(x); out: return NULL; } struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id) { unsigned int h; struct xfrm_state *x = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); if (m->reqid) { h = xfrm_dst_hash(net, &m->old_daddr, &m->old_saddr, m->reqid, m->old_family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; if (m->reqid && x->props.reqid != m->reqid) continue; if (if_id != 0 && x->if_id != if_id) continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, m->old_family)) continue; xfrm_state_hold(x); break; } } else { h = xfrm_src_hash(net, &m->old_daddr, &m->old_saddr, m->old_family); hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; if (if_id != 0 && x->if_id != if_id) continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, m->old_family)) continue; xfrm_state_hold(x); break; } } spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_migrate_state_find); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap) { struct xfrm_state *xc; xc = xfrm_state_clone(x, encap); if (!xc) return NULL; xc->props.family = m->new_family; if (xfrm_init_state(xc) < 0) goto error; memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); /* add state */ if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) { /* a care is needed when the destination address of the state is to be updated as it is a part of triplet */ xfrm_state_insert(xc); } else { if (xfrm_state_add(xc) < 0) goto error; } return xc; error: xfrm_state_put(xc); return NULL; } EXPORT_SYMBOL(xfrm_state_migrate); #endif int xfrm_state_update(struct xfrm_state *x) { struct xfrm_state *x1, *to_put; int err; int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); struct net *net = xs_net(x); to_put = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); x1 = __xfrm_state_locate(x, use_spi, x->props.family); err = -ESRCH; if (!x1) goto out; if (xfrm_state_kern(x1)) { to_put = x1; err = -EEXIST; goto out; } if (x1->km.state == XFRM_STATE_ACQ) { if (x->dir && x1->dir != x->dir) goto out; __xfrm_state_insert(x); x = NULL; } else { if (x1->dir != x->dir) goto out; } err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (to_put) xfrm_state_put(to_put); if (err) return err; if (!x) { xfrm_state_delete(x1); xfrm_state_put(x1); return 0; } err = -EINVAL; spin_lock_bh(&x1->lock); if (likely(x1->km.state == XFRM_STATE_VALID)) { if (x->encap && x1->encap && x->encap->encap_type == x1->encap->encap_type) memcpy(x1->encap, x->encap, sizeof(*x1->encap)); else if (x->encap || x1->encap) goto fail; if (x->coaddr && x1->coaddr) { memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr)); } if (!use_spi && memcmp(&x1->sel, &x->sel, sizeof(x1->sel))) memcpy(&x1->sel, &x->sel, sizeof(x1->sel)); memcpy(&x1->lft, &x->lft, sizeof(x1->lft)); x1->km.dying = 0; hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { spin_lock_bh(&net->xfrm.xfrm_state_lock); if (x->props.smark.m || x->props.smark.v) x1->props.smark = x->props.smark; if (x->if_id) x1->if_id = x->if_id; __xfrm_state_bump_genids(x1); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } err = 0; x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); } fail: spin_unlock_bh(&x1->lock); xfrm_state_put(x1); return err; } EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { xfrm_dev_state_update_stats(x); if (!READ_ONCE(x->curlft.use_time)) WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { x->km.state = XFRM_STATE_EXPIRED; hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT); return -EINVAL; } if (!x->km.dying && (x->curlft.bytes >= x->lft.soft_byte_limit || x->curlft.packets >= x->lft.soft_packet_limit)) { x->km.dying = 1; km_state_expired(x, 0, 0); } return 0; } EXPORT_SYMBOL(xfrm_state_check_expire); void xfrm_state_update_stats(struct net *net) { struct xfrm_state *x; int i; spin_lock_bh(&net->xfrm.xfrm_state_lock); for (i = 0; i <= net->xfrm.state_hmask; i++) { hlist_for_each_entry(x, net->xfrm.state_bydst + i, bydst) xfrm_dev_state_update_stats(x); } spin_unlock_bh(&net->xfrm.xfrm_state_lock); } struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { struct xfrm_state *x; rcu_read_lock(); x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); rcu_read_unlock(); return x; } EXPORT_SYMBOL(xfrm_state_lookup); struct xfrm_state * xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __find_acq_core(net, mark, family, mode, reqid, if_id, pcpu_num, proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_find_acq); #ifdef CONFIG_XFRM_SUB_POLICY #if IS_ENABLED(CONFIG_IPV6) /* distribution counting sort function for xfrm_state and xfrm_tmpl */ static void __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(const void *p), int maxclass) { int count[XFRM_MAX_DEPTH] = { }; int class[XFRM_MAX_DEPTH]; int i; for (i = 0; i < n; i++) { int c = cmp(src[i]); class[i] = c; count[c]++; } for (i = 2; i < maxclass; i++) count[i] += count[i - 1]; for (i = 0; i < n; i++) { dst[count[class[i] - 1]++] = src[i]; src[i] = NULL; } } /* Rule for xfrm_state: * * rule 1: select IPsec transport except AH * rule 2: select MIPv6 RO or inbound trigger * rule 3: select IPsec transport AH * rule 4: select IPsec tunnel * rule 5: others */ static int __xfrm6_state_sort_cmp(const void *p) { const struct xfrm_state *v = p; switch (v->props.mode) { case XFRM_MODE_TRANSPORT: if (v->id.proto != IPPROTO_AH) return 1; else return 3; #if IS_ENABLED(CONFIG_IPV6_MIP6) case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: return 2; #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: return 4; } return 5; } /* Rule for xfrm_tmpl: * * rule 1: select IPsec transport * rule 2: select MIPv6 RO or inbound trigger * rule 3: select IPsec tunnel * rule 4: others */ static int __xfrm6_tmpl_sort_cmp(const void *p) { const struct xfrm_tmpl *v = p; switch (v->mode) { case XFRM_MODE_TRANSPORT: return 1; #if IS_ENABLED(CONFIG_IPV6_MIP6) case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: return 2; #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: return 3; } return 4; } #else static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; } static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; } static inline void __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(const void *p), int maxclass) { int i; for (i = 0; i < n; i++) dst[i] = src[i]; } #endif /* CONFIG_IPV6 */ void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, unsigned short family) { int i; if (family == AF_INET6) __xfrm6_sort((void **)dst, (void **)src, n, __xfrm6_tmpl_sort_cmp, 5); else for (i = 0; i < n; i++) dst[i] = src[i]; } void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family) { int i; if (family == AF_INET6) __xfrm6_sort((void **)dst, (void **)src, n, __xfrm6_state_sort_cmp, 6); else for (i = 0; i < n; i++) dst[i] = src[i]; } #endif /* Silly enough, but I'm lazy to build resolution list */ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { unsigned int h = xfrm_seq_hash(net, seq); struct xfrm_state *x; hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { if (x->km.seq == seq && (mark & x->mark.m) == x->mark.v && x->pcpu_num == pcpu_num && x->km.state == XFRM_STATE_ACQ) { xfrm_state_hold(x); return x; } } return NULL; } struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __xfrm_find_acq_byseq(net, mark, seq, pcpu_num); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_find_acq_byseq); u32 xfrm_get_acqseq(void) { u32 res; static atomic_t acqseq; do { res = atomic_inc_return(&acqseq); } while (!res); return res; } EXPORT_SYMBOL(xfrm_get_acqseq); int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack) { switch (proto) { case IPPROTO_AH: case IPPROTO_ESP: break; case IPPROTO_COMP: /* IPCOMP spi is 16-bits. */ if (max >= 0x10000) { NL_SET_ERR_MSG(extack, "IPCOMP SPI must be <= 65535"); return -EINVAL; } break; default: NL_SET_ERR_MSG(extack, "Invalid protocol, must be one of AH, ESP, IPCOMP"); return -EINVAL; } if (min > max) { NL_SET_ERR_MSG(extack, "Invalid SPI range: min > max"); return -EINVAL; } return 0; } EXPORT_SYMBOL(verify_spi_info); int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, struct netlink_ext_ack *extack) { struct net *net = xs_net(x); unsigned int h; struct xfrm_state *x0; int err = -ENOENT; __be32 minspi = htonl(low); __be32 maxspi = htonl(high); __be32 newspi = 0; u32 mark = x->mark.v & x->mark.m; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_DEAD) { NL_SET_ERR_MSG(extack, "Target ACQUIRE is in DEAD state"); goto unlock; } err = 0; if (x->id.spi) goto unlock; err = -ENOENT; if (minspi == maxspi) { x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family); if (x0) { NL_SET_ERR_MSG(extack, "Requested SPI is already in use"); xfrm_state_put(x0); goto unlock; } newspi = minspi; } else { u32 spi = 0; for (h = 0; h < high-low+1; h++) { spi = get_random_u32_inclusive(low, high); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { newspi = htonl(spi); break; } xfrm_state_put(x0); } } if (newspi) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; } else { NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); } unlock: spin_unlock_bh(&x->lock); return err; } EXPORT_SYMBOL(xfrm_alloc_spi); static bool __xfrm_state_filter_match(struct xfrm_state *x, struct xfrm_address_filter *filter) { if (filter) { if ((filter->family == AF_INET || filter->family == AF_INET6) && x->props.family != filter->family) return false; return addr_match(&x->props.saddr, &filter->saddr, filter->splen) && addr_match(&x->id.daddr, &filter->daddr, filter->dplen); } return true; } int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *data) { struct xfrm_state *state; struct xfrm_state_walk *x; int err = 0; if (walk->seq != 0 && list_empty(&walk->all)) return 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); if (list_empty(&walk->all)) x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all); else x = list_first_entry(&walk->all, struct xfrm_state_walk, all); list_for_each_entry_from(x, &net->xfrm.state_all, all) { if (x->state == XFRM_STATE_DEAD) continue; state = container_of(x, struct xfrm_state, km); if (!xfrm_id_proto_match(state->id.proto, walk->proto)) continue; if (!__xfrm_state_filter_match(state, walk->filter)) continue; err = func(state, walk->seq, data); if (err) { list_move_tail(&walk->all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { err = -ENOENT; goto out; } list_del_init(&walk->all); out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_walk); void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto, struct xfrm_address_filter *filter) { INIT_LIST_HEAD(&walk->all); walk->proto = proto; walk->state = XFRM_STATE_DEAD; walk->seq = 0; walk->filter = filter; } EXPORT_SYMBOL(xfrm_state_walk_init); void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net) { kfree(walk->filter); if (list_empty(&walk->all)) return; spin_lock_bh(&net->xfrm.xfrm_state_lock); list_del(&walk->all); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_state_walk_done); static void xfrm_replay_timer_handler(struct timer_list *t) { struct xfrm_state *x = from_timer(x, t, rtimer); spin_lock(&x->lock); if (x->km.state == XFRM_STATE_VALID) { if (xfrm_aevent_is_on(xs_net(x))) xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT); else x->xflags |= XFRM_TIME_DEFER; } spin_unlock(&x->lock); } static LIST_HEAD(xfrm_km_list); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) if (km->notify_policy) km->notify_policy(xp, dir, c); rcu_read_unlock(); } void km_state_notify(struct xfrm_state *x, const struct km_event *c) { struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) if (km->notify) km->notify(x, c); rcu_read_unlock(); } EXPORT_SYMBOL(km_policy_notify); EXPORT_SYMBOL(km_state_notify); void km_state_expired(struct xfrm_state *x, int hard, u32 portid) { struct km_event c; c.data.hard = hard; c.portid = portid; c.event = XFRM_MSG_EXPIRE; km_state_notify(x, &c); } EXPORT_SYMBOL(km_state_expired); /* * We send to all registered managers regardless of failure * We are happy with one success */ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) { int err = -EINVAL, acqret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { acqret = km->acquire(x, t, pol); if (!acqret) err = acqret; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_query); static int __km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { int err = -EINVAL; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->new_mapping) err = km->new_mapping(x, ipaddr, sport); if (!err) break; } rcu_read_unlock(); return err; } int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { int ret = 0; if (x->mapping_maxage) { if ((jiffies / HZ - x->new_mapping) > x->mapping_maxage || x->new_mapping_sport != sport) { x->new_mapping_sport = sport; x->new_mapping = jiffies / HZ; ret = __km_new_mapping(x, ipaddr, sport); } } else { ret = __km_new_mapping(x, ipaddr, sport); } return ret; } EXPORT_SYMBOL(km_new_mapping); void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid) { struct km_event c; c.data.hard = hard; c.portid = portid; c.event = XFRM_MSG_POLEXPIRE; km_policy_notify(pol, dir, &c); } EXPORT_SYMBOL(km_policy_expired); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { int err = -EINVAL; int ret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->migrate) { ret = km->migrate(sel, dir, type, m, num_migrate, k, encap); if (!ret) err = ret; } } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_migrate); #endif int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr) { int err = -EINVAL; int ret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->report) { ret = km->report(net, proto, sel, addr); if (!ret) err = ret; } } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_report); static bool km_is_alive(const struct km_event *c) { struct xfrm_mgr *km; bool is_alive = false; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->is_alive && km->is_alive(c)) { is_alive = true; break; } } rcu_read_unlock(); return is_alive; } #if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) static DEFINE_SPINLOCK(xfrm_translator_lock); static struct xfrm_translator __rcu *xfrm_translator; struct xfrm_translator *xfrm_get_translator(void) { struct xfrm_translator *xtr; rcu_read_lock(); xtr = rcu_dereference(xfrm_translator); if (unlikely(!xtr)) goto out; if (!try_module_get(xtr->owner)) xtr = NULL; out: rcu_read_unlock(); return xtr; } EXPORT_SYMBOL_GPL(xfrm_get_translator); void xfrm_put_translator(struct xfrm_translator *xtr) { module_put(xtr->owner); } EXPORT_SYMBOL_GPL(xfrm_put_translator); int xfrm_register_translator(struct xfrm_translator *xtr) { int err = 0; spin_lock_bh(&xfrm_translator_lock); if (unlikely(xfrm_translator != NULL)) err = -EEXIST; else rcu_assign_pointer(xfrm_translator, xtr); spin_unlock_bh(&xfrm_translator_lock); return err; } EXPORT_SYMBOL_GPL(xfrm_register_translator); int xfrm_unregister_translator(struct xfrm_translator *xtr) { int err = 0; spin_lock_bh(&xfrm_translator_lock); if (likely(xfrm_translator != NULL)) { if (rcu_access_pointer(xfrm_translator) != xtr) err = -EINVAL; else RCU_INIT_POINTER(xfrm_translator, NULL); } spin_unlock_bh(&xfrm_translator_lock); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(xfrm_unregister_translator); #endif int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) { int err; u8 *data; struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; if (sockptr_is_null(optval) && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); __sk_dst_reset(sk); return 0; } if (optlen <= 0 || optlen > PAGE_SIZE) return -EMSGSIZE; data = memdup_sockptr(optval, optlen); if (IS_ERR(data)) return PTR_ERR(data); if (in_compat_syscall()) { struct xfrm_translator *xtr = xfrm_get_translator(); if (!xtr) { kfree(data); return -EOPNOTSUPP; } err = xtr->xlate_user_policy_sockptr(&data, optlen); xfrm_put_translator(xtr); if (err) { kfree(data); return err; } } err = -EINVAL; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { pol = km->compile_policy(sk, optname, data, optlen, &err); if (err >= 0) break; } rcu_read_unlock(); if (err >= 0) { xfrm_sk_policy_insert(sk, err, pol); xfrm_pol_put(pol); __sk_dst_reset(sk); err = 0; } kfree(data); return err; } EXPORT_SYMBOL(xfrm_user_policy); static DEFINE_SPINLOCK(xfrm_km_lock); void xfrm_register_km(struct xfrm_mgr *km) { spin_lock_bh(&xfrm_km_lock); list_add_tail_rcu(&km->list, &xfrm_km_list); spin_unlock_bh(&xfrm_km_lock); } EXPORT_SYMBOL(xfrm_register_km); void xfrm_unregister_km(struct xfrm_mgr *km) { spin_lock_bh(&xfrm_km_lock); list_del_rcu(&km->list); spin_unlock_bh(&xfrm_km_lock); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_unregister_km); int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) { int err = 0; if (WARN_ON(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) err = -EEXIST; else rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo); spin_unlock_bh(&xfrm_state_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_state_register_afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) { int err = 0, family = afinfo->family; if (WARN_ON(family >= NPROTO)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_state_afinfo_lock); if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo) err = -EINVAL; else RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL); } spin_unlock_bh(&xfrm_state_afinfo_lock); synchronize_rcu(); return err; } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family) { if (unlikely(family >= NPROTO)) return NULL; return rcu_dereference(xfrm_state_afinfo[family]); } EXPORT_SYMBOL_GPL(xfrm_state_afinfo_get_rcu); struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) { struct xfrm_state_afinfo *afinfo; if (unlikely(family >= NPROTO)) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_state_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } void xfrm_flush_gc(void) { flush_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(xfrm_flush_gc); /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ void xfrm_state_delete_tunnel(struct xfrm_state *x) { if (x->tunnel) { struct xfrm_state *t = x->tunnel; if (atomic_read(&t->tunnel_users) == 2) xfrm_state_delete(t); atomic_dec(&t->tunnel_users); xfrm_state_put_sync(t); x->tunnel = NULL; } } EXPORT_SYMBOL(xfrm_state_delete_tunnel); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; u32 blksize, net_adj = 0; if (x->km.state != XFRM_STATE_VALID || !type || type->proto != IPPROTO_ESP) return mtu - x->props.header_len; aead = x->data; blksize = ALIGN(crypto_aead_blocksize(aead), 4); switch (x->props.mode) { case XFRM_MODE_TRANSPORT: case XFRM_MODE_BEET: if (x->props.family == AF_INET) net_adj = sizeof(struct iphdr); else if (x->props.family == AF_INET6) net_adj = sizeof(struct ipv6hdr); break; case XFRM_MODE_TUNNEL: break; default: WARN_ON_ONCE(1); break; } return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, struct netlink_ext_ack *extack) { const struct xfrm_mode *inner_mode; const struct xfrm_mode *outer_mode; int family = x->props.family; int err; if (family == AF_INET && READ_ONCE(xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)) x->props.flags |= XFRM_STATE_NOPMTUDISC; err = -EPROTONOSUPPORT; if (x->sel.family != AF_UNSPEC) { inner_mode = xfrm_get_mode(x->props.mode, x->sel.family); if (inner_mode == NULL) { NL_SET_ERR_MSG(extack, "Requested mode not found"); goto error; } if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) && family != x->sel.family) { NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate a change of family"); goto error; } x->inner_mode = *inner_mode; } else { const struct xfrm_mode *inner_mode_iaf; int iafamily = AF_INET; inner_mode = xfrm_get_mode(x->props.mode, x->props.family); if (inner_mode == NULL) { NL_SET_ERR_MSG(extack, "Requested mode not found"); goto error; } x->inner_mode = *inner_mode; if (x->props.family == AF_INET) iafamily = AF_INET6; inner_mode_iaf = xfrm_get_mode(x->props.mode, iafamily); if (inner_mode_iaf) { if (inner_mode_iaf->flags & XFRM_MODE_FLAG_TUNNEL) x->inner_mode_iaf = *inner_mode_iaf; } } x->type = xfrm_get_type(x->id.proto, family); if (x->type == NULL) { NL_SET_ERR_MSG(extack, "Requested type not found"); goto error; } x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload); err = x->type->init_state(x, extack); if (err) goto error; outer_mode = xfrm_get_mode(x->props.mode, family); if (!outer_mode) { NL_SET_ERR_MSG(extack, "Requested mode not found"); err = -EPROTONOSUPPORT; goto error; } x->outer_mode = *outer_mode; if (init_replay) { err = xfrm_init_replay(x, extack); if (err) goto error; } if (x->nat_keepalive_interval) { if (x->dir != XFRM_SA_DIR_OUT) { NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for outbound SAs"); err = -EINVAL; goto error; } if (!x->encap || x->encap->encap_type != UDP_ENCAP_ESPINUDP) { NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for UDP encapsulation"); err = -EINVAL; goto error; } } error: return err; } EXPORT_SYMBOL(__xfrm_init_state); int xfrm_init_state(struct xfrm_state *x) { int err; err = __xfrm_init_state(x, true, false, NULL); if (!err) x->km.state = XFRM_STATE_VALID; return err; } EXPORT_SYMBOL(xfrm_init_state); int __net_init xfrm_state_init(struct net *net) { unsigned int sz; if (net_eq(net, &init_net)) xfrm_state_cache = KMEM_CACHE(xfrm_state, SLAB_HWCACHE_ALIGN | SLAB_PANIC); INIT_LIST_HEAD(&net->xfrm.state_all); sz = sizeof(struct hlist_head) * 8; net->xfrm.state_bydst = xfrm_hash_alloc(sz); if (!net->xfrm.state_bydst) goto out_bydst; net->xfrm.state_bysrc = xfrm_hash_alloc(sz); if (!net->xfrm.state_bysrc) goto out_bysrc; net->xfrm.state_byspi = xfrm_hash_alloc(sz); if (!net->xfrm.state_byspi) goto out_byspi; net->xfrm.state_byseq = xfrm_hash_alloc(sz); if (!net->xfrm.state_byseq) goto out_byseq; net->xfrm.state_cache_input = alloc_percpu(struct hlist_head); if (!net->xfrm.state_cache_input) goto out_state_cache_input; net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); net->xfrm.state_num = 0; INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); spin_lock_init(&net->xfrm.xfrm_state_lock); seqcount_spinlock_init(&net->xfrm.xfrm_state_hash_generation, &net->xfrm.xfrm_state_lock); return 0; out_state_cache_input: xfrm_hash_free(net->xfrm.state_byseq, sz); out_byseq: xfrm_hash_free(net->xfrm.state_byspi, sz); out_byspi: xfrm_hash_free(net->xfrm.state_bysrc, sz); out_bysrc: xfrm_hash_free(net->xfrm.state_bydst, sz); out_bydst: return -ENOMEM; } void xfrm_state_fini(struct net *net) { unsigned int sz; flush_work(&net->xfrm.state_hash_work); flush_work(&xfrm_state_gc_work); xfrm_state_flush(net, 0, false, true); WARN_ON(!list_empty(&net->xfrm.state_all)); sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.state_byseq)); xfrm_hash_free(net->xfrm.state_byseq, sz); WARN_ON(!hlist_empty(net->xfrm.state_byspi)); xfrm_hash_free(net->xfrm.state_byspi, sz); WARN_ON(!hlist_empty(net->xfrm.state_bysrc)); xfrm_hash_free(net->xfrm.state_bysrc, sz); WARN_ON(!hlist_empty(net->xfrm.state_bydst)); xfrm_hash_free(net->xfrm.state_bydst, sz); free_percpu(net->xfrm.state_cache_input); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_helper_sainfo(struct xfrm_state *x, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = x->security; u32 spi = ntohl(x->id.spi); if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (x->props.family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4 dst=%pI4", &x->props.saddr.a4, &x->id.daddr.a4); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6 dst=%pI6", x->props.saddr.a6, x->id.daddr.a6); break; } audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); } static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family, struct audit_buffer *audit_buf) { const struct iphdr *iph4; const struct ipv6hdr *iph6; switch (family) { case AF_INET: iph4 = ip_hdr(skb); audit_log_format(audit_buf, " src=%pI4 dst=%pI4", &iph4->saddr, &iph4->daddr); break; case AF_INET6: iph6 = ipv6_hdr(skb); audit_log_format(audit_buf, " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x", &iph6->saddr, &iph6->daddr, iph6->flow_lbl[0] & 0x0f, iph6->flow_lbl[1], iph6->flow_lbl[2]); break; } } void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SAD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); xfrm_audit_helper_sainfo(x, audit_buf); audit_log_format(audit_buf, " res=%u", result); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_add); void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SAD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); xfrm_audit_helper_sainfo(x, audit_buf); audit_log_format(audit_buf, " res=%u", result); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_delete); void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-replay-overflow"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); /* don't record the sequence number because it's inherent in this kind * of audit message */ spi = ntohl(x->id.spi); audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow); void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-replayed-pkt"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); spi = ntohl(x->id.spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, ntohl(net_seq)); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_replay); void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SA-notfound"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, family, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound_simple); void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-notfound"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, family, audit_buf); spi = ntohl(net_spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, ntohl(net_seq)); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound); void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto) { struct audit_buffer *audit_buf; __be32 net_spi; __be32 net_seq; audit_buf = xfrm_audit_start("SA-icv-failure"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); if (xfrm_parse_spi(skb, proto, &net_spi, &net_seq) == 0) { u32 spi = ntohl(net_spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, ntohl(net_seq)); } audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_icvfail); #endif /* CONFIG_AUDITSYSCALL */ |
5 2 2 5 5 34 34 34 34 1 28 28 29 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2003-2005 Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2015 Intel Deutschland GmbH * Copyright (C) 2021-2023 Intel Corporation */ #include <linux/kobject.h> #include <linux/slab.h> #include "ieee80211_i.h" #include "key.h" #include "debugfs.h" #include "debugfs_key.h" #define KEY_READ(name, prop, format_string) \ static ssize_t key_##name##_read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct ieee80211_key *key = file->private_data; \ return mac80211_format_buffer(userbuf, count, ppos, \ format_string, key->prop); \ } #define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n") #define KEY_OPS(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .llseek = generic_file_llseek, \ } #define KEY_OPS_W(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_##name##_read, \ .write = key_##name##_write, \ .llseek = generic_file_llseek, \ } #define KEY_FILE(name, format) \ KEY_READ_##format(name) \ KEY_OPS(name) #define KEY_CONF_READ(name, format_string) \ KEY_READ(conf_##name, conf.name, format_string) #define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n") #define KEY_CONF_OPS(name) \ static const struct debugfs_short_fops key_ ##name## _ops = { \ .read = key_conf_##name##_read, \ .llseek = generic_file_llseek, \ } #define KEY_CONF_FILE(name, format) \ KEY_CONF_READ_##format(name) \ KEY_CONF_OPS(name) KEY_CONF_FILE(keylen, D); KEY_CONF_FILE(keyidx, D); KEY_CONF_FILE(hw_key_idx, D); KEY_FILE(flags, X); KEY_READ(ifindex, sdata->name, "%s\n"); KEY_OPS(ifindex); static ssize_t key_algorithm_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { char buf[15]; struct ieee80211_key *key = file->private_data; u32 c = key->conf.cipher; sprintf(buf, "%.2x-%.2x-%.2x:%d\n", c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff); return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } KEY_OPS(algorithm); static ssize_t key_tx_spec_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; u64 pn; int ret; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: /* not supported yet */ return -EOPNOTSUPP; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: ret = kstrtou64_from_user(userbuf, count, 16, &pn); if (ret) return ret; /* PN is a 48-bit counter */ if (pn >= (1ULL << 48)) return -ERANGE; atomic64_set(&key->conf.tx_pn, pn); return count; default: return 0; } } static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { u64 pn; char buf[20]; int len; struct ieee80211_key *key = file->private_data; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%08x %04x\n", TKIP_PN_TO_IV32(pn), TKIP_PN_TO_IV16(pn)); break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: pn = atomic64_read(&key->conf.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS_W(tx_spec); static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[14*IEEE80211_NUM_TIDS+1], *p = buf; int i, len; const u8 *rpn; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_WEP40: case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; case WLAN_CIPHER_SUITE_TKIP: for (i = 0; i < IEEE80211_NUM_TIDS; i++) p += scnprintf(p, sizeof(buf)+buf-p, "%08x %04x\n", key->u.tkip.rx[i].iv32, key->u.tkip.rx[i].iv16); len = p - buf; break; case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { rpn = key->u.ccmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); } len = p - buf; break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: rpn = key->u.aes_cmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); len = p - buf; break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: rpn = key->u.aes_gmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); len = p - buf; break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { rpn = key->u.gcmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", rpn[0], rpn[1], rpn[2], rpn[3], rpn[4], rpn[5]); } len = p - buf; break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(rx_spec); static ssize_t key_replays_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); break; case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.replays); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_gmac.replays); break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.gcmp.replays); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(replays); static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_AES_CMAC: case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.icverrors); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_gmac.icverrors); break; default: return 0; } return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(icverrors); static ssize_t key_mic_failures_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; char buf[20]; int len; if (key->conf.cipher != WLAN_CIPHER_SUITE_TKIP) return -EINVAL; len = scnprintf(buf, sizeof(buf), "%u\n", key->u.tkip.mic_failures); return simple_read_from_buffer(userbuf, count, ppos, buf, len); } KEY_OPS(mic_failures); static ssize_t key_key_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct ieee80211_key *key = file->private_data; int i, bufsize = 2 * key->conf.keylen + 2; char *buf = kmalloc(bufsize, GFP_KERNEL); char *p = buf; ssize_t res; if (!buf) return -ENOMEM; for (i = 0; i < key->conf.keylen; i++) p += scnprintf(p, bufsize + buf - p, "%02x", key->conf.key[i]); p += scnprintf(p, bufsize+buf-p, "\n"); res = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); kfree(buf); return res; } KEY_OPS(key); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, key->debugfs.dir, \ key, &key_##name##_ops) #define DEBUGFS_ADD_W(name) \ debugfs_create_file(#name, 0600, key->debugfs.dir, \ key, &key_##name##_ops); void ieee80211_debugfs_key_add(struct ieee80211_key *key) { static int keycount; char buf[100]; struct sta_info *sta; if (!key->local->debugfs.keys) return; sprintf(buf, "%d", keycount); key->debugfs.cnt = keycount; keycount++; key->debugfs.dir = debugfs_create_dir(buf, key->local->debugfs.keys); sta = key->sta; if (sta) { sprintf(buf, "../../netdev:%s/stations/%pM", sta->sdata->name, sta->sta.addr); key->debugfs.stalink = debugfs_create_symlink("station", key->debugfs.dir, buf); } DEBUGFS_ADD(keylen); DEBUGFS_ADD(flags); DEBUGFS_ADD(keyidx); DEBUGFS_ADD(hw_key_idx); DEBUGFS_ADD(algorithm); DEBUGFS_ADD_W(tx_spec); DEBUGFS_ADD(rx_spec); DEBUGFS_ADD(replays); DEBUGFS_ADD(icverrors); DEBUGFS_ADD(mic_failures); DEBUGFS_ADD(key); DEBUGFS_ADD(ifindex); }; void ieee80211_debugfs_key_remove(struct ieee80211_key *key) { if (!key) return; debugfs_remove_recursive(key->debugfs.dir); key->debugfs.dir = NULL; } void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) { char buf[50]; struct ieee80211_key *key; if (!sdata->vif.debugfs_dir) return; lockdep_assert_wiphy(sdata->local->hw.wiphy); debugfs_remove(sdata->debugfs.default_unicast_key); sdata->debugfs.default_unicast_key = NULL; if (sdata->default_unicast_key) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->default_unicast_key); sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_unicast_key = debugfs_create_symlink("default_unicast_key", sdata->vif.debugfs_dir, buf); } debugfs_remove(sdata->debugfs.default_multicast_key); sdata->debugfs.default_multicast_key = NULL; if (sdata->deflink.default_multicast_key) { key = wiphy_dereference(sdata->local->hw.wiphy, sdata->deflink.default_multicast_key); sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_multicast_key = debugfs_create_symlink("default_multicast_key", sdata->vif.debugfs_dir, buf); } } void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata) { char buf[50]; struct ieee80211_key *key; if (!sdata->vif.debugfs_dir) return; key = wiphy_dereference(sdata->local->hw.wiphy, sdata->deflink.default_mgmt_key); if (key) { sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_mgmt_key = debugfs_create_symlink("default_mgmt_key", sdata->vif.debugfs_dir, buf); } else ieee80211_debugfs_key_remove_mgmt_default(sdata); } void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) return; debugfs_remove(sdata->debugfs.default_mgmt_key); sdata->debugfs.default_mgmt_key = NULL; } void ieee80211_debugfs_key_add_beacon_default(struct ieee80211_sub_if_data *sdata) { char buf[50]; struct ieee80211_key *key; if (!sdata->vif.debugfs_dir) return; key = wiphy_dereference(sdata->local->hw.wiphy, sdata->deflink.default_beacon_key); if (key) { sprintf(buf, "../keys/%d", key->debugfs.cnt); sdata->debugfs.default_beacon_key = debugfs_create_symlink("default_beacon_key", sdata->vif.debugfs_dir, buf); } else { ieee80211_debugfs_key_remove_beacon_default(sdata); } } void ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) return; debugfs_remove(sdata->debugfs.default_beacon_key); sdata->debugfs.default_beacon_key = NULL; } void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, struct sta_info *sta) { debugfs_remove(key->debugfs.stalink); key->debugfs.stalink = NULL; } |
4 1 1 1 1 1 1 1 130 131 5 5 1 2 1 3 6 7 3 2 2 1 1 6 6 6 6 6 10 10 3 2 1 2 2 1 1 6 6 1 2 3 3 3 54 11 5 1 1 1 1 3 15 1 8 6 1 4 11 11 1 2 1 3 5 6 2 5 5 1 2 3 1 3 19 53 18 51 53 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 | // SPDX-License-Identifier: GPL-2.0 /* * XFRM virtual interface * * Copyright (C) 2018 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_link.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/gso.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/dst_metadata.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; static const struct net_device_ops xfrmi_netdev_ops; #define XFRMI_HASH_BITS 8 #define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; struct xfrm_if __rcu *collect_md_xfrmi; }; static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), }; static void xfrmi_destroy_state(struct lwtunnel_state *lwt) { } static int xfrmi_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_XFRM_MAX + 1]; struct lwtunnel_state *new_state; struct xfrm_md_info *info; int ret; ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); if (ret < 0) return ret; if (!tb[LWT_XFRM_IF_ID]) { NL_SET_ERR_MSG(extack, "if_id must be set"); return -EINVAL; } new_state = lwtunnel_state_alloc(sizeof(*info)); if (!new_state) { NL_SET_ERR_MSG(extack, "failed to create encap info"); return -ENOMEM; } new_state->type = LWTUNNEL_ENCAP_XFRM; info = lwt_xfrm_info(new_state); info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); if (tb[LWT_XFRM_LINK]) info->link = nla_get_u32(tb[LWT_XFRM_LINK]); *ts = new_state; return 0; } static int xfrmi_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct xfrm_md_info *info = lwt_xfrm_info(lwt); if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) return -EMSGSIZE; return 0; } static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ } static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct xfrm_md_info *a_info = lwt_xfrm_info(a); struct xfrm_md_info *b_info = lwt_xfrm_info(b); return memcmp(a_info, b_info, sizeof(*a_info)); } static const struct lwtunnel_encap_ops xfrmi_encap_ops = { .build_state = xfrmi_build_state, .destroy_state = xfrmi_destroy_state, .fill_encap = xfrmi_fill_encap_info, .get_encap_size = xfrmi_encap_nlsize, .cmp_encap = xfrmi_encap_cmp, .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) static u32 xfrmi_hash(u32 if_id) { return hash_32(if_id, XFRMI_HASH_BITS); } static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; } xi = rcu_dereference(xfrmn->collect_md_xfrmi); if (xi && (xi->dev->flags & IFF_UP)) return xi; return NULL; } static bool xfrmi_decode_session(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res) { struct net_device *dev; struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return false; switch (family) { case AF_INET6: ifindex = inet6_sdif(skb); break; case AF_INET: ifindex = inet_sdif(skb); break; } if (ifindex) { struct net *net = xs_net(xfrm_input_state(skb)); dev = dev_get_by_index_rcu(net, ifindex); } else { dev = skb->dev; } if (!dev || !(dev->flags & IFF_UP)) return false; if (dev->netdev_ops != &xfrmi_netdev_ops) return false; xi = netdev_priv(dev); res->net = xi->net; if (xi->p.collect_md) res->if_id = xfrm_input_state(skb)->if_id; else res->if_id = xi->p.if_id; return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); } static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip; struct xfrm_if *iter; for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { rcu_assign_pointer(*xip, xi->next); break; } } } static void xfrmi_dev_free(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); } static int xfrmi_create(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = dev_net(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; dev->rtnl_link_ops = &xfrmi_link_ops; err = register_netdevice(dev); if (err < 0) goto out; if (xi->p.collect_md) rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); else xfrmi_link(xfrmn, xi); return 0; out: return err; } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) return xi; return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); if (xi->p.collect_md) RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); else xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) return; ipvs_reset(skb); secpath_reset(skb); skb_orphan(skb); skb->mark = 0; } static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, unsigned short family) { struct sec_path *sp; sp = skb_sec_path(skb); if (sp && (sp->len || sp->olen) && !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) goto discard; XFRM_SPI_SKB_CB(skb)->family = family; if (family == AF_INET) { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; } else { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; } return xfrm_input(skb, nexthdr, spi, encap_type); discard: kfree_skb(skb); return 0; } static int xfrmi4_rcv(struct sk_buff *skb) { return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); } static int xfrmi6_rcv(struct sk_buff *skb) { return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, 0, AF_INET6); } static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); } static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); } static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; struct net_device *dev; struct xfrm_state *x; struct xfrm_if *xi; bool xnet; int link; if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); xi = xfrmi_lookup(xs_net(x), x); if (!xi) return 1; link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } xnet = !net_eq(xi->net, dev_net(skb->dev)); if (xnet) { inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, inner_mode->family)) return -EPERM; } xfrmi_scrub_packet(skb, xnet); if (xi->p.collect_md) { struct metadata_dst *md_dst; md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); if (!md_dst) return -ENOMEM; md_dst->u.xfrm_info.if_id = x->if_id; md_dst->u.xfrm_info.link = link; skb_dst_set(skb, (struct dst_entry *)md_dst); } dev_sw_netstats_rx_add(dev, skb->len); return 0; } static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; struct xfrm_state *x; int err = -1; u32 if_id; int mtu; if (xi->p.collect_md) { struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); if (unlikely(!md_info)) return -EINVAL; if_id = md_info->if_id; fl->flowi_oif = md_info->link; if (md_info->dst_orig) { struct dst_entry *tmp_dst = dst; dst = md_info->dst_orig; skb_dst_set(skb, dst); md_info->dst_orig = NULL; dst_release(tmp_dst); } } else { if_id = xi->p.if_id; } dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } x = dst->xfrm; if (!x) goto tx_err_link_failure; if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > 1280) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); else goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } dst_release(dst); return -EMSGSIZE; } xmit: xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = tdev; err = dst_output(xi->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); if (!dst) { struct rtable *rt; fl.u.ip4.flowi4_oif = dev->ifindex; fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); } break; default: goto tx_err; } fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int xfrmi4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->protocol; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->nexthdr; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) { if (xi->p.link != p->link) return -EINVAL; xi->p.if_id = p->if_id; return 0; } static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; xfrmi_unlink(xfrmn, xi); synchronize_net(); err = xfrmi_change(xi, p); xfrmi_link(xfrmn, xi); netdev_state_change(xi->dev); return err; } static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->p.link); } static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); } #define XFRMI_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; err = gro_cells_init(&xi->gro_cells, dev); if (err) return err; dev->lltx = true; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; dev->needed_tailroom = phydev->needed_tailroom; if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, phydev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); } else { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); } return 0; } static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void xfrmi_netlink_parms(struct nlattr *data[], struct xfrm_if_parms *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_XFRM_LINK]) parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); if (data[IFLA_XFRM_COLLECT_METADATA]) parms->collect_md = true; } static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct xfrm_if_parms p = {}; struct xfrm_if *xi; int err; xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); if (p.link || p.if_id) { NL_SET_ERR_MSG(extack, "link and if_id must be zero"); return -EINVAL; } if (rtnl_dereference(xfrmn->collect_md_xfrmi)) return -EEXIST; } else { if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; } xi = netdev_priv(dev); xi->p = p; xi->net = net; xi->dev = dev; err = xfrmi_create(dev); return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } if (p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; if (xi->p.collect_md) { NL_SET_ERR_MSG(extack, "device can't be changed to collect_md"); return -EINVAL; } } return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) { return /* IFLA_XFRM_LINK */ nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + /* IFLA_XFRM_COLLECT_METADATA */ nla_total_size(0) + 0; } static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return READ_ONCE(xi->net); } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, [IFLA_XFRM_LINK] = { .type = NLA_U32 }, [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .kind = "xfrm", .maxtype = IFLA_XFRM_MAX, .policy = xfrmi_policy, .priv_size = sizeof(struct xfrm_if), .setup = xfrmi_dev_setup, .validate = xfrmi_validate, .newlink = xfrmi_newlink, .dellink = xfrmi_dellink, .changelink = xfrmi_changelink, .get_size = xfrmi_get_size, .fill_info = xfrmi_fill_info, .get_link_net = xfrmi_get_link_net, }; static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_exit_list, exit_list) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; int i; for (i = 0; i < XFRMI_HASH_SIZE; i++) { for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } xi = rtnl_dereference(xfrmn->collect_md_xfrmi); if (xi) unregister_netdevice_queue(xi->dev, dev_to_kill); } } static struct pernet_operations xfrmi_net_ops = { .exit_batch_rtnl = xfrmi_exit_batch_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrmi6_rcv, .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int xfrmi6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrmi4_rcv, .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) static int xfrmi4_rcv_tunnel(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 2, }; #endif static int __init xfrmi4_init(void) { int err; err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm_tunnel_ipip6_failed: xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi4_fini(void) { #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); } static int __init xfrmi6_init(void) { int err; err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ip6ip_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm_tunnel_ip6ip_failed: xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); xfrm_tunnel_ipv6_failed: xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi6_fini(void) { #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); } static const struct xfrm_if_cb xfrm_if_cb = { .decode_session = xfrmi_decode_session, }; static int __init xfrmi_init(void) { const char *msg; int err; pr_info("IPsec XFRM device driver\n"); msg = "tunnel device"; err = register_pernet_device(&xfrmi_net_ops); if (err < 0) goto pernet_dev_failed; msg = "xfrm4 protocols"; err = xfrmi4_init(); if (err < 0) goto xfrmi4_failed; msg = "xfrm6 protocols"; err = xfrmi6_init(); if (err < 0) goto xfrmi6_failed; msg = "netlink interface"; err = rtnl_link_register(&xfrmi_link_ops); if (err < 0) goto rtnl_link_failed; err = register_xfrm_interface_bpf(); if (err < 0) goto kfunc_failed; lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; kfunc_failed: rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: xfrmi4_fini(); xfrmi4_failed: unregister_pernet_device(&xfrmi_net_ops); pernet_dev_failed: pr_err("xfrmi init: failed to register %s\n", msg); return err; } static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); unregister_pernet_device(&xfrmi_net_ops); } module_init(xfrmi_init); module_exit(xfrmi_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("xfrm"); MODULE_ALIAS_NETDEV("xfrm0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("XFRM virtual interface"); |
7460 196 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NODEMASK_H #define __LINUX_NODEMASK_H /* * Nodemasks provide a bitmap suitable for representing the * set of Node's in a system, one bit position per Node number. * * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * * For details of nodemask_parse_user(), see bitmap_parse_user() in * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(), * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in * lib/bitmap.c. * * The available nodemask operations are: * * void node_set(node, mask) turn on bit 'node' in mask * void node_clear(node, mask) turn off bit 'node' in mask * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask * int node_test_and_set(node, mask) test and set bit 'node' in mask * * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * * int nodes_equal(mask1, mask2) Does mask1 == mask2? * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? * int nodes_empty(mask) Is mask empty (no bits sets)? * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * void nodes_shift_right(dst, src, n) Shift right * void nodes_shift_left(dst, src, n) Shift left * * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * unsigned int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) * void nodes_onto(dst, orig, relmap) *dst = orig relative to relmap * void nodes_fold(dst, orig, sz) dst bits = orig bits mod sz * * for_each_node_mask(node, mask) for-loop node over mask * * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * * int node_random(mask) Random node with set bit in mask * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * * for_each_node(node) for-loop node over node_possible_map * for_each_online_node(node) for-loop node over node_online_map * * Subtlety: * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. * * NODEMASK_SCRATCH * When doing above logical AND, OR, XOR, Remap operations the callers tend to * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper * for such situations. See below and CPUMASK_ALLOC also. */ #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/nodemask_types.h> #include <linux/numa.h> #include <linux/random.h> extern nodemask_t _unused_nodemask_arg_; /** * nodemask_pr_args - printf args to output a nodemask * @maskp: nodemask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions * are called in both __init and non-__init functions, if they are not * inlined we will end up with a section mismatch error (of the type of * freeable items not being freed). So we must use __always_inline here * to fix the problem. If other functions in the future also end up in * this situation they will also need to be annotated as __always_inline */ #define node_set(node, dst) __node_set((node), &(dst)) static __always_inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } #define node_clear(node, dst) __node_clear((node), &(dst)) static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } /* No static inline type checking - see Subtlety (1) above. */ #define node_isset(node, nodemask) test_bit((node), (nodemask).bits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static __always_inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static __always_inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) static __always_inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); } #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) static __always_inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min_ts could be dropped. */ #define first_node(src) __first_node(&(src)) static __always_inline unsigned int __first_node(const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* * Find the next present node in src, starting after node n, wrapping around to * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) { unsigned int ret = __next_node(node, srcp); if (ret == MAX_NUMNODES) ret = __first_node(srcp); return ret; } static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); } #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL << (node); \ } else { \ init_nodemask_of_node(&m, (node)); \ } \ m; \ }) #define first_unset_node(mask) __first_unset_node(&(mask)) static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) { return min_t(unsigned int, MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG #define NODE_MASK_ALL \ ((nodemask_t) { { \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #else #define NODE_MASK_ALL \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #endif #define NODE_MASK_NONE \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ } }) #define nodes_addr(src) ((src).bits) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static __always_inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) static __always_inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); } #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); } #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); } #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); } #if MAX_NUMNODES > 1 #define for_each_node_mask(node, mask) \ for ((node) = first_node(mask); \ (node) < MAX_NUMNODES; \ (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ #define for_each_node_mask(node, mask) \ for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++) #endif /* MAX_NUMNODES */ /* * Bitmasks that are kept for all the nodes. */ enum node_states { N_POSSIBLE, /* The node could become online at some point */ N_ONLINE, /* The node is online */ N_NORMAL_MEMORY, /* The node has regular memory */ #ifdef CONFIG_HIGHMEM N_HIGH_MEMORY, /* The node has regular or high memory */ #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; /* * The following particular system nodemasks and operations * on them manage all possible and online nodes. */ extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 static __always_inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } static __always_inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } static __always_inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } static __always_inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } #define for_each_node_state(__node, __state) \ for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) static __always_inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } static __always_inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; static __always_inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } static __always_inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } #else static __always_inline int node_state(int node, enum node_states state) { return node == 0; } static __always_inline void node_set_state(int node, enum node_states state) { } static __always_inline void node_clear_state(int node, enum node_states state) { } static __always_inline int num_node_state(enum node_states state) { return 1; } #define for_each_node_state(node, __state) \ for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif static __always_inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; w = nodes_weight(*maskp); switch (w) { case 0: bit = NUMA_NO_NODE; break; case 1: bit = first_node(*maskp); break; default: bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(w)); break; } return bit; #else return 0; #endif } #define node_online_map node_states[N_ONLINE] #define node_possible_map node_states[N_POSSIBLE] #define num_online_nodes() num_node_state(N_ONLINE) #define num_possible_nodes() num_node_state(N_POSSIBLE) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ #if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) #else #define NODEMASK_ALLOC(type, name, gfp_flags) type _##name, *name = &_##name #define NODEMASK_FREE(m) do {} while (0) #endif /* Example structure for using NODEMASK_ALLOC, used in mempolicy. */ struct nodemask_scratch { nodemask_t mask1; nodemask_t mask2; }; #define NODEMASK_SCRATCH(x) \ NODEMASK_ALLOC(struct nodemask_scratch, x, \ GFP_KERNEL | __GFP_NORETRY) #define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) #endif /* __LINUX_NODEMASK_H */ |
1 1 11 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_SCHED_H #define __NET_PKT_SCHED_H #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/if_vlan.h> #include <linux/netdevice.h> #include <net/sch_generic.h> #include <net/net_namespace.h> #include <uapi/linux/pkt_sched.h> #define DEFAULT_TX_QUEUE_LEN 1000 #define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; int skip; int count; int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; #define qdisc_priv(q) \ _Generic(q, \ const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) static inline struct Qdisc *qdisc_from_priv(void *priv) { return container_of(priv, struct Qdisc, privdata); } /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth Normal IP packet size ~ 512byte, hence: 0.5Kbyte/1Mbyte/sec = 0.5msec, so that we need 50usec timer for 10Mbit ethernet. 10msec resolution -> <50Kbit/sec. The result: [34]86 is not good choice for QoS router :-( The things are not so bad, because we may use artificial clock evaluated by integration of network data flow in the most critical places. */ typedef u64 psched_time_t; typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT) #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT) #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC) #define PSCHED_PASTPERFECT 0 static inline psched_time_t psched_get_time(void) { return PSCHED_NS2TICKS(ktime_get_ns()); } struct qdisc_watchdog { struct hrtimer timer; struct Qdisc *qdisc; }; void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns); static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL); } static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; int fifo_set_limit(struct Qdisc *q, unsigned int limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); #define NET_SCH_ALIAS_PREFIX "net-sch-" #define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); qdisc_run_end(q); } } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ static inline unsigned int psched_mtu(const struct net_device *dev) { return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); } struct tc_query_caps_base { enum tc_setup_type type; void *caps; }; struct tc_cbs_qopt_offload { u8 enable; s32 queue; s32 hicredit; s32 locredit; s32 idleslope; s32 sendslope; }; struct tc_etf_qopt_offload { u8 enable; s32 queue; }; struct tc_mqprio_caps { bool validate_queue_counts:1; }; struct tc_mqprio_qopt_offload { /* struct tc_mqprio_qopt must always be the first element */ struct tc_mqprio_qopt qopt; struct netlink_ext_ack *extack; u16 mode; u16 shaper; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; unsigned long preemptible_tcs; }; struct tc_taprio_caps { bool supports_queue_max_sdu:1; bool gate_mask_per_txq:1; /* Device expects lower TXQ numbers to have higher priority over higher * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. */ bool broken_mqprio:1; }; enum tc_taprio_qopt_cmd { TAPRIO_CMD_REPLACE, TAPRIO_CMD_DESTROY, TAPRIO_CMD_STATS, TAPRIO_CMD_QUEUE_STATS, }; /** * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics * @window_drops: Frames that were dropped because they were too large to be * transmitted in any of the allotted time windows (open gates) for their * traffic class. * @tx_overruns: Frames still being transmitted by the MAC after the * transmission gate associated with their traffic class has closed. * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. */ struct tc_taprio_qopt_stats { u64 window_drops; u64 tx_overruns; }; struct tc_taprio_qopt_queue_stats { int queue; struct tc_taprio_qopt_stats stats; }; struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ /* The gate_mask in the offloading side refers to traffic classes */ u32 gate_mask; u32 interval; }; struct tc_taprio_qopt_offload { enum tc_taprio_qopt_cmd cmd; union { /* TAPRIO_CMD_STATS */ struct tc_taprio_qopt_stats stats; /* TAPRIO_CMD_QUEUE_STATS */ struct tc_taprio_qopt_queue_stats queue_stats; /* TAPRIO_CMD_REPLACE */ struct { struct tc_mqprio_qopt_offload mqprio; struct netlink_ext_ack *extack; ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; }; }; }; #if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) /* Reference counting */ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); #else /* Reference counting */ static inline struct tc_taprio_qopt_offload * taprio_offload_get(struct tc_taprio_qopt_offload *offload) { return NULL; } static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { } #endif /* Ensure skb_mstamp_ns, which might have been populated with the txtime, is * not mistaken for a software timestamp, because this will otherwise prevent * the dispatch of hardware timestamps to the socket. */ static inline void skb_txtime_consumed(struct sk_buff *skb) { skb->tstamp = ktime_set(0, 0); } static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) { if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #endif |
1 1 1 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 | // SPDX-License-Identifier: GPL-2.0 /* * seq_buf.c * * Copyright (C) 2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> * * The seq_buf is a handy tool that allows you to pass a descriptor around * to a buffer that other functions can write to. It is similar to the * seq_file functionality but has some differences. * * To use it, the seq_buf must be initialized with seq_buf_init(). * This will set up the counters within the descriptor. You can call * seq_buf_init() more than once to reset the seq_buf to start * from scratch. */ #include <linux/bug.h> #include <linux/err.h> #include <linux/export.h> #include <linux/hex.h> #include <linux/minmax.h> #include <linux/printk.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sprintf.h> #include <linux/string.h> #include <linux/types.h> #include <linux/uaccess.h> /** * seq_buf_can_fit - can the new data fit in the current buffer? * @s: the seq_buf descriptor * @len: The length to see if it can fit in the current buffer * * Returns: true if there's enough unused space in the seq_buf buffer * to fit the amount of new data according to @len. */ static bool seq_buf_can_fit(struct seq_buf *s, size_t len) { return s->len + len <= s->size; } /** * seq_buf_print_seq - move the contents of seq_buf into a seq_file * @m: the seq_file descriptor that is the destination * @s: the seq_buf descriptor that is the source. * * Returns: zero on success, non-zero otherwise. */ int seq_buf_print_seq(struct seq_file *m, struct seq_buf *s) { unsigned int len = seq_buf_used(s); return seq_write(m, s->buffer, len); } /** * seq_buf_vprintf - sequence printing of information. * @s: seq_buf descriptor * @fmt: printf format string * @args: va_list of arguments from a printf() type function * * Writes a vnprintf() format into the sequence buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args) { int len; WARN_ON(s->size == 0); if (s->len < s->size) { len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args); if (s->len + len < s->size) { s->len += len; return 0; } } seq_buf_set_overflow(s); return -1; } /** * seq_buf_printf - sequence printing of information * @s: seq_buf descriptor * @fmt: printf format string * * Writes a printf() format into the sequence buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = seq_buf_vprintf(s, fmt, ap); va_end(ap); return ret; } EXPORT_SYMBOL_GPL(seq_buf_printf); /** * seq_buf_do_printk - printk() seq_buf line by line * @s: seq_buf descriptor * @lvl: printk level * * printk()-s a multi-line sequential buffer line by line. The function * makes sure that the buffer in @s is NUL-terminated and safe to read * as a string. */ void seq_buf_do_printk(struct seq_buf *s, const char *lvl) { const char *start, *lf; if (s->size == 0 || s->len == 0) return; start = seq_buf_str(s); while ((lf = strchr(start, '\n'))) { int len = lf - start + 1; printk("%s%.*s", lvl, len, start); start = ++lf; } /* No trailing LF */ if (start < s->buffer + s->len) printk("%s%s\n", lvl, start); } EXPORT_SYMBOL_GPL(seq_buf_do_printk); #ifdef CONFIG_BINARY_PRINTF /** * seq_buf_bprintf - Write the printf string from binary arguments * @s: seq_buf descriptor * @fmt: The format string for the @binary arguments * @binary: The binary arguments for @fmt. * * When recording in a fast path, a printf may be recorded with just * saving the format and the arguments as they were passed to the * function, instead of wasting cycles converting the arguments into * ASCII characters. Instead, the arguments are saved in a 32 bit * word array that is defined by the format string constraints. * * This function will take the format and the binary array and finish * the conversion into the ASCII string within the buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary) { unsigned int len = seq_buf_buffer_left(s); int ret; WARN_ON(s->size == 0); if (s->len < s->size) { ret = bstr_printf(s->buffer + s->len, len, fmt, binary); if (s->len + ret < s->size) { s->len += ret; return 0; } } seq_buf_set_overflow(s); return -1; } #endif /* CONFIG_BINARY_PRINTF */ /** * seq_buf_puts - sequence printing of simple string * @s: seq_buf descriptor * @str: simple string to record * * Copy a simple string into the sequence buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_puts(struct seq_buf *s, const char *str) { size_t len = strlen(str); WARN_ON(s->size == 0); /* Add 1 to len for the trailing null byte which must be there */ len += 1; if (seq_buf_can_fit(s, len)) { memcpy(s->buffer + s->len, str, len); /* Don't count the trailing null byte against the capacity */ s->len += len - 1; return 0; } seq_buf_set_overflow(s); return -1; } EXPORT_SYMBOL_GPL(seq_buf_puts); /** * seq_buf_putc - sequence printing of simple character * @s: seq_buf descriptor * @c: simple character to record * * Copy a single character into the sequence buffer. * * Returns: zero on success, -1 on overflow. */ int seq_buf_putc(struct seq_buf *s, unsigned char c) { WARN_ON(s->size == 0); if (seq_buf_can_fit(s, 1)) { s->buffer[s->len++] = c; return 0; } seq_buf_set_overflow(s); return -1; } EXPORT_SYMBOL_GPL(seq_buf_putc); /** * seq_buf_putmem - write raw data into the sequence buffer * @s: seq_buf descriptor * @mem: The raw memory to copy into the buffer * @len: The length of the raw memory to copy (in bytes) * * There may be cases where raw memory needs to be written into the * buffer and a strcpy() would not work. Using this function allows * for such cases. * * Returns: zero on success, -1 on overflow. */ int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len) { WARN_ON(s->size == 0); if (seq_buf_can_fit(s, len)) { memcpy(s->buffer + s->len, mem, len); s->len += len; return 0; } seq_buf_set_overflow(s); return -1; } #define MAX_MEMHEX_BYTES 8U #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) /** * seq_buf_putmem_hex - write raw memory into the buffer in ASCII hex * @s: seq_buf descriptor * @mem: The raw memory to write its hex ASCII representation of * @len: The length of the raw memory to copy (in bytes) * * This is similar to seq_buf_putmem() except instead of just copying the * raw memory into the buffer it writes its ASCII representation of it * in hex characters. * * Returns: zero on success, -1 on overflow. */ int seq_buf_putmem_hex(struct seq_buf *s, const void *mem, unsigned int len) { unsigned char hex[HEX_CHARS]; const unsigned char *data = mem; unsigned int start_len; int i, j; WARN_ON(s->size == 0); BUILD_BUG_ON(MAX_MEMHEX_BYTES * 2 >= HEX_CHARS); while (len) { start_len = min(len, MAX_MEMHEX_BYTES); #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < start_len; i++) { #else for (i = start_len-1, j = 0; i >= 0; i--) { #endif hex[j++] = hex_asc_hi(data[i]); hex[j++] = hex_asc_lo(data[i]); } if (WARN_ON_ONCE(j == 0 || j/2 > len)) break; /* j increments twice per loop */ hex[j++] = ' '; seq_buf_putmem(s, hex, j); if (seq_buf_has_overflowed(s)) return -1; len -= start_len; data += start_len; } return 0; } /** * seq_buf_path - copy a path into the sequence buffer * @s: seq_buf descriptor * @path: path to write into the sequence buffer. * @esc: set of characters to escape in the output * * Write a path name into the sequence buffer. * * Returns: the number of written bytes on success, -1 on overflow. */ int seq_buf_path(struct seq_buf *s, const struct path *path, const char *esc) { char *buf; size_t size = seq_buf_get_buf(s, &buf); int res = -1; WARN_ON(s->size == 0); if (size) { char *p = d_path(path, buf, size); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); if (end) res = end - buf; } } seq_buf_commit(s, res); return res; } /** * seq_buf_to_user - copy the sequence buffer to user space * @s: seq_buf descriptor * @ubuf: The userspace memory location to copy to * @start: The first byte in the buffer to copy * @cnt: The amount to copy * * Copies the sequence buffer into the userspace memory pointed to * by @ubuf. It starts from @start and writes up to @cnt characters * or until it reaches the end of the content in the buffer (@s->len), * whichever comes first. * * Returns: * On success, it returns a positive number of the number of bytes * it copied. * * On failure it returns -EBUSY if all of the content in the * sequence has been already read, which includes nothing in the * sequence (@s->len == @start). * * Returns -EFAULT if the copy to userspace fails. */ int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, size_t start, int cnt) { int len; int ret; if (!cnt) return 0; len = seq_buf_used(s); if (len <= start) return -EBUSY; len -= start; if (cnt > len) cnt = len; ret = copy_to_user(ubuf, s->buffer + start, cnt); if (ret == cnt) return -EFAULT; return cnt - ret; } /** * seq_buf_hex_dump - print formatted hex dump into the sequence buffer * @s: seq_buf descriptor * @prefix_str: string to prefix each line with; * caller supplies trailing spaces for alignment if desired * @prefix_type: controls whether prefix of an offset, address, or none * is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE) * @rowsize: number of bytes to print per line; must be 16 or 32 * @groupsize: number of bytes to print at a time (1, 2, 4, 8; default = 1) * @buf: data blob to dump * @len: number of bytes in the @buf * @ascii: include ASCII after the hex output * * Function is an analogue of print_hex_dump() and thus has similar interface. * * linebuf size is maximal length for one line. * 32 * 3 - maximum bytes per line, each printed into 2 chars + 1 for * separating space * 2 - spaces separating hex dump and ASCII representation * 32 - ASCII representation * 1 - terminating '\0' * * Returns: zero on success, -1 on overflow. */ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii) { const u8 *ptr = buf; int i, linelen, remaining = len; unsigned char linebuf[32 * 3 + 2 + 32 + 1]; int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; for (i = 0; i < len; i += rowsize) { linelen = min(remaining, rowsize); remaining -= rowsize; hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize, linebuf, sizeof(linebuf), ascii); switch (prefix_type) { case DUMP_PREFIX_ADDRESS: ret = seq_buf_printf(s, "%s%p: %s\n", prefix_str, ptr + i, linebuf); break; case DUMP_PREFIX_OFFSET: ret = seq_buf_printf(s, "%s%.8x: %s\n", prefix_str, i, linebuf); break; default: ret = seq_buf_printf(s, "%s%s\n", prefix_str, linebuf); break; } if (ret) return ret; } return 0; } |
1 21 1 1 1 1 21 1 1 18 1 2 44 14 1 1 7 1 1 1 25 5 32 4 28 26 19 3 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/ematch.c Extended Match API * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * An extended match (ematch) is a small classification tool not worth * writing a full classifier for. Ematches can be interconnected to form * a logic expression and get attached to classifiers to extend their * functionatlity. * * The userspace part transforms the logic expressions into an array * consisting of multiple sequences of interconnected ematches separated * by markers. Precedence is implemented by a special ematch kind * referencing a sequence beyond the marker of the current sequence * causing the current position in the sequence to be pushed onto a stack * to allow the current position to be overwritten by the position referenced * in the special ematch. Matching continues in the new sequence until a * marker is reached causing the position to be restored from the stack. * * Example: * A AND (B1 OR B2) AND C AND D * * ------->-PUSH------- * -->-- / -->-- \ -->-- * / \ / / \ \ / \ * +-------+-------+-------+-------+-------+--------+ * | A AND | B AND | C AND | D END | B1 OR | B2 END | * +-------+-------+-------+-------+-------+--------+ * \ / * --------<-POP--------- * * where B is a virtual ematch referencing to sequence starting with B1. * * ========================================================================== * * How to write an ematch in 60 seconds * ------------------------------------ * * 1) Provide a matcher function: * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, * struct tcf_pkt_info *info) * { * struct mydata *d = (struct mydata *) m->data; * * if (...matching goes here...) * return 1; * else * return 0; * } * * 2) Fill out a struct tcf_ematch_ops: * static struct tcf_ematch_ops my_ops = { * .kind = unique id, * .datalen = sizeof(struct mydata), * .match = my_match, * .owner = THIS_MODULE, * }; * * 3) Register/Unregister your ematch: * static int __init init_my_ematch(void) * { * return tcf_em_register(&my_ops); * } * * static void __exit exit_my_ematch(void) * { * tcf_em_unregister(&my_ops); * } * * module_init(init_my_ematch); * module_exit(exit_my_ematch); * * 4) By now you should have two more seconds left, barely enough to * open up a beer to watch the compilation going. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/pkt_cls.h> static LIST_HEAD(ematch_ops); static DEFINE_RWLOCK(ematch_mod_lock); static struct tcf_ematch_ops *tcf_em_lookup(u16 kind) { struct tcf_ematch_ops *e = NULL; read_lock(&ematch_mod_lock); list_for_each_entry(e, &ematch_ops, link) { if (kind == e->kind) { if (!try_module_get(e->owner)) e = NULL; read_unlock(&ematch_mod_lock); return e; } } read_unlock(&ematch_mod_lock); return NULL; } /** * tcf_em_register - register an extended match * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their presence. * The given @ops must have kind set to a unique identifier and the * callback match() must be implemented. All other callbacks are optional * and a fallback implementation is used instead. * * Returns -EEXISTS if an ematch of the same kind has already registered. */ int tcf_em_register(struct tcf_ematch_ops *ops) { int err = -EEXIST; struct tcf_ematch_ops *e; if (ops->match == NULL) return -EINVAL; write_lock(&ematch_mod_lock); list_for_each_entry(e, &ematch_ops, link) if (ops->kind == e->kind) goto errout; list_add_tail(&ops->link, &ematch_ops); err = 0; errout: write_unlock(&ematch_mod_lock); return err; } EXPORT_SYMBOL(tcf_em_register); /** * tcf_em_unregister - unregister and extended match * * @ops: ematch operations lookup table * * This function must be called by ematches to announce their disappearance * for examples when the module gets unloaded. The @ops parameter must be * the same as the one used for registration. * * Returns -ENOENT if no matching ematch was found. */ void tcf_em_unregister(struct tcf_ematch_ops *ops) { write_lock(&ematch_mod_lock); list_del(&ops->link); write_unlock(&ematch_mod_lock); } EXPORT_SYMBOL(tcf_em_unregister); static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, int index) { return &tree->matches[index]; } static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *tree_hdr, struct tcf_ematch *em, struct nlattr *nla, int idx) { int err = -EINVAL; struct tcf_ematch_hdr *em_hdr = nla_data(nla); int data_len = nla_len(nla) - sizeof(*em_hdr); void *data = (void *) em_hdr + sizeof(*em_hdr); struct net *net = tp->chain->block->net; if (!TCF_EM_REL_VALID(em_hdr->flags)) goto errout; if (em_hdr->kind == TCF_EM_CONTAINER) { /* Special ematch called "container", carries an index * referencing an external ematch sequence. */ u32 ref; if (data_len < sizeof(ref)) goto errout; ref = *(u32 *) data; if (ref >= tree_hdr->nmatches) goto errout; /* We do not allow backward jumps to avoid loops and jumps * to our own position are of course illegal. */ if (ref <= idx) goto errout; em->data = ref; } else { /* Note: This lookup will increase the module refcnt * of the ematch module referenced. In case of a failure, * a destroy function is called by the underlying layer * which automatically releases the reference again, therefore * the module MUST not be given back under any circumstances * here. Be aware, the destroy function assumes that the * module is held if the ops field is non zero. */ em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops == NULL) { err = -ENOENT; #ifdef CONFIG_MODULES __rtnl_unlock(); request_module("ematch-kind-%u", em_hdr->kind); rtnl_lock(); em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops) { /* We dropped the RTNL mutex in order to * perform the module load. Tell the caller * to replay the request. */ module_put(em->ops->owner); em->ops = NULL; err = -EAGAIN; } #endif goto errout; } /* ematch module provides expected length of data, so we * can do a basic sanity check. */ if (em->ops->datalen && data_len < em->ops->datalen) goto errout; if (em->ops->change) { err = -EINVAL; if (em_hdr->flags & TCF_EM_SIMPLE) goto errout; err = em->ops->change(net, data, data_len, em); if (err < 0) goto errout; } else if (data_len > 0) { /* ematch module doesn't provide an own change * procedure and expects us to allocate and copy * the ematch data. * * TCF_EM_SIMPLE may be specified stating that the * data only consists of a u32 integer and the module * does not expected a memory reference but rather * the value carried. */ if (em_hdr->flags & TCF_EM_SIMPLE) { if (em->ops->datalen > 0) goto errout; if (data_len < sizeof(u32)) goto errout; em->data = *(u32 *) data; } else { void *v = kmemdup(data, data_len, GFP_KERNEL); if (v == NULL) { err = -ENOBUFS; goto errout; } em->data = (unsigned long) v; } em->datalen = data_len; } } em->matchid = em_hdr->matchid; em->flags = em_hdr->flags; em->net = net; err = 0; errout: return err; } static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = { [TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) }, [TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED }, }; /** * tcf_em_tree_validate - validate ematch config TLV and build ematch tree * * @tp: classifier kind handle * @nla: ematch tree configuration TLV * @tree: destination ematch tree variable to store the resulting * ematch tree. * * This function validates the given configuration TLV @nla and builds an * ematch tree in @tree. The resulting tree must later be copied into * the private classifier data using tcf_em_tree_change(). You MUST NOT * provide the ematch tree variable of the private classifier data directly, * the changes would not be locked properly. * * Returns a negative error code if the configuration TLV contains errors. */ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, struct tcf_ematch_tree *tree) { int idx, list_len, matches_len, err; struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1]; struct nlattr *rt_match, *rt_hdr, *rt_list; struct tcf_ematch_tree_hdr *tree_hdr; struct tcf_ematch *em; memset(tree, 0, sizeof(*tree)); if (!nla) return 0; err = nla_parse_nested_deprecated(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL); if (err < 0) goto errout; err = -EINVAL; rt_hdr = tb[TCA_EMATCH_TREE_HDR]; rt_list = tb[TCA_EMATCH_TREE_LIST]; if (rt_hdr == NULL || rt_list == NULL) goto errout; tree_hdr = nla_data(rt_hdr); memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); rt_match = nla_data(rt_list); list_len = nla_len(rt_list); matches_len = tree_hdr->nmatches * sizeof(*em); tree->matches = kzalloc(matches_len, GFP_KERNEL); if (tree->matches == NULL) goto errout; /* We do not use nla_parse_nested here because the maximum * number of attributes is unknown. This saves us the allocation * for a tb buffer which would serve no purpose at all. * * The array of rt attributes is parsed in the order as they are * provided, their type must be incremental from 1 to n. Even * if it does not serve any real purpose, a failure of sticking * to this policy will result in parsing failure. */ for (idx = 0; nla_ok(rt_match, list_len); idx++) { err = -EINVAL; if (rt_match->nla_type != (idx + 1)) goto errout_abort; if (idx >= tree_hdr->nmatches) goto errout_abort; if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr)) goto errout_abort; em = tcf_em_get_match(tree, idx); err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx); if (err < 0) goto errout_abort; rt_match = nla_next(rt_match, &list_len); } /* Check if the number of matches provided by userspace actually * complies with the array of matches. The number was used for * the validation of references and a mismatch could lead to * undefined references during the matching process. */ if (idx != tree_hdr->nmatches) { err = -EINVAL; goto errout_abort; } err = 0; errout: return err; errout_abort: tcf_em_tree_destroy(tree); return err; } EXPORT_SYMBOL(tcf_em_tree_validate); /** * tcf_em_tree_destroy - destroy an ematch tree * * @tree: ematch tree to be deleted * * This functions destroys an ematch tree previously created by * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that * the ematch tree is not in use before calling this function. */ void tcf_em_tree_destroy(struct tcf_ematch_tree *tree) { int i; if (tree->matches == NULL) return; for (i = 0; i < tree->hdr.nmatches; i++) { struct tcf_ematch *em = tcf_em_get_match(tree, i); if (em->ops) { if (em->ops->destroy) em->ops->destroy(em); else if (!tcf_em_is_simple(em)) kfree((void *) em->data); module_put(em->ops->owner); } } tree->hdr.nmatches = 0; kfree(tree->matches); tree->matches = NULL; } EXPORT_SYMBOL(tcf_em_tree_destroy); /** * tcf_em_tree_dump - dump ematch tree into a rtnl message * * @skb: skb holding the rtnl message * @tree: ematch tree to be dumped * @tlv: TLV type to be used to encapsulate the tree * * This function dumps a ematch tree into a rtnl message. It is valid to * call this function while the ematch tree is in use. * * Returns -1 if the skb tailroom is insufficient. */ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) { int i; u8 *tail; struct nlattr *top_start; struct nlattr *list_start; top_start = nla_nest_start_noflag(skb, tlv); if (top_start == NULL) goto nla_put_failure; if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr)) goto nla_put_failure; list_start = nla_nest_start_noflag(skb, TCA_EMATCH_TREE_LIST); if (list_start == NULL) goto nla_put_failure; tail = skb_tail_pointer(skb); for (i = 0; i < tree->hdr.nmatches; i++) { struct nlattr *match_start = (struct nlattr *)tail; struct tcf_ematch *em = tcf_em_get_match(tree, i); struct tcf_ematch_hdr em_hdr = { .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, .matchid = em->matchid, .flags = em->flags }; if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr)) goto nla_put_failure; if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) goto nla_put_failure; } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { u32 u = em->data; nla_put_nohdr(skb, sizeof(u), &u); } else if (em->datalen > 0) nla_put_nohdr(skb, em->datalen, (void *) em->data); tail = skb_tail_pointer(skb); match_start->nla_len = tail - (u8 *)match_start; } nla_nest_end(skb, list_start); nla_nest_end(skb, top_start); return 0; nla_put_failure: return -1; } EXPORT_SYMBOL(tcf_em_tree_dump); static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { int r = em->ops->match(skb, em, info); return tcf_em_is_inverted(em) ? !r : r; } /* Do not use this function directly, use tcf_em_tree_match instead */ int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, struct tcf_pkt_info *info) { int stackp = 0, match_idx = 0, res = 0; struct tcf_ematch *cur_match; int stack[CONFIG_NET_EMATCH_STACK]; proceed: while (match_idx < tree->hdr.nmatches) { cur_match = tcf_em_get_match(tree, match_idx); if (tcf_em_is_container(cur_match)) { if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK)) goto stack_overflow; stack[stackp++] = match_idx; match_idx = cur_match->data; goto proceed; } res = tcf_em_match(skb, cur_match, info); if (tcf_em_early_end(cur_match, res)) break; match_idx++; } pop_stack: if (stackp > 0) { match_idx = stack[--stackp]; cur_match = tcf_em_get_match(tree, match_idx); if (tcf_em_is_inverted(cur_match)) res = !res; if (tcf_em_early_end(cur_match, res)) { goto pop_stack; } else { match_idx++; goto proceed; } } return res; stack_overflow: net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n"); return -1; } EXPORT_SYMBOL(__tcf_em_tree_match); |
458 861 458 861 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | // SPDX-License-Identifier: GPL-2.0-only /* * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6 * * Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/slab.h> #include <net/ipv6.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("ip6tables mangle table"); #define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_MANGLE, }; static unsigned int ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr, daddr; unsigned int ret, verdict; u32 flowlabel, mark; u8 hop_limit; int err; /* save source/dest address, mark, hoplimit, flowlabel, priority, */ memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr)); memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr)); mark = skb->mark; hop_limit = ipv6_hdr(skb)->hop_limit; /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); ret = ip6t_do_table(priv, skb, state); verdict = ret & NF_VERDICT_MASK; if (verdict != NF_DROP && verdict != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || !ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) || skb->mark != mark || ipv6_hdr(skb)->hop_limit != hop_limit || flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) { err = ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } return ret; } /* The work comes in here from netfilter.c. */ static unsigned int ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) return ip6t_mangle_out(priv, skb, state); return ip6t_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; static int ip6table_mangle_table_init(struct net *net) { struct ip6t_replace *repl; int ret; repl = ip6t_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { ip6t_unregister_table_pre_exit(net, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "mangle"); } static struct pernet_operations ip6table_mangle_net_ops = { .pre_exit = ip6table_mangle_net_pre_exit, .exit = ip6table_mangle_net_exit, }; static int __init ip6table_mangle_init(void) { int ret = xt_register_template(&packet_mangler, ip6table_mangle_table_init); if (ret < 0) return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); if (IS_ERR(mangle_ops)) { xt_unregister_template(&packet_mangler); return PTR_ERR(mangle_ops); } ret = register_pernet_subsys(&ip6table_mangle_net_ops); if (ret < 0) { xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } return ret; } static void __exit ip6table_mangle_fini(void) { unregister_pernet_subsys(&ip6table_mangle_net_ops); xt_unregister_template(&packet_mangler); kfree(mangle_ops); } module_init(ip6table_mangle_init); module_exit(ip6table_mangle_fini); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Routines to manage notifier chains for passing status changes to any * interested routines. We need this instead of hard coded call lists so * that modules can poke their nose into the innards. The network devices * needed them so here they are for the rest of you. * * Alan Cox <Alan.Cox@linux.org> */ #ifndef _LINUX_NOTIFIER_H #define _LINUX_NOTIFIER_H #include <linux/errno.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/srcu.h> /* * Notifier chains are of four types: * * Atomic notifier chains: Chain callbacks run in interrupt/atomic * context. Callouts are not allowed to block. * Blocking notifier chains: Chain callbacks run in process context. * Callouts are allowed to block. * Raw notifier chains: There are no restrictions on callbacks, * registration, or unregistration. All locking and protection * must be provided by the caller. * SRCU notifier chains: A variant of blocking notifier chains, with * the same restrictions. * * atomic_notifier_chain_register() may be called from an atomic context, * but blocking_notifier_chain_register() and srcu_notifier_chain_register() * must be called from a process context. Ditto for the corresponding * _unregister() routines. * * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(), * and srcu_notifier_chain_unregister() _must not_ be called from within * the call chain. * * SRCU notifier chains are an alternative form of blocking notifier chains. * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for * protection of the chain links. This means there is _very_ low overhead * in srcu_notifier_call_chain(): no cache bounces and no memory barriers. * As compensation, srcu_notifier_chain_unregister() is rather expensive. * SRCU notifier chains should be used when the chain will be called very * often but notifier_blocks will seldom be removed. */ struct notifier_block; typedef int (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data); struct notifier_block { notifier_fn_t notifier_call; struct notifier_block __rcu *next; int priority; }; struct atomic_notifier_head { spinlock_t lock; struct notifier_block __rcu *head; }; struct blocking_notifier_head { struct rw_semaphore rwsem; struct notifier_block __rcu *head; }; struct raw_notifier_head { struct notifier_block __rcu *head; }; struct srcu_notifier_head { struct mutex mutex; struct srcu_usage srcuu; struct srcu_struct srcu; struct notifier_block __rcu *head; }; #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ spin_lock_init(&(name)->lock); \ (name)->head = NULL; \ } while (0) #define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \ init_rwsem(&(name)->rwsem); \ (name)->head = NULL; \ } while (0) #define RAW_INIT_NOTIFIER_HEAD(name) do { \ (name)->head = NULL; \ } while (0) /* srcu_notifier_heads must be cleaned up dynamically */ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define srcu_cleanup_notifier_head(name) \ cleanup_srcu_struct(&(name)->srcu); #define ATOMIC_NOTIFIER_INIT(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = NULL } #define BLOCKING_NOTIFIER_INIT(name) { \ .rwsem = __RWSEM_INITIALIZER((name).rwsem), \ .head = NULL } #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } #define SRCU_NOTIFIER_INIT(name, pcpu) \ { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ .srcuu = __SRCU_USAGE_INIT(name.srcuu), \ .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ } #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ ATOMIC_NOTIFIER_INIT(name) #define BLOCKING_NOTIFIER_HEAD(name) \ struct blocking_notifier_head name = \ BLOCKING_NOTIFIER_INIT(name) #define RAW_NOTIFIER_HEAD(name) \ struct raw_notifier_head name = \ RAW_NOTIFIER_INIT(name) #ifdef CONFIG_TREE_SRCU #define _SRCU_NOTIFIER_HEAD(name, mod) \ static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \ mod struct srcu_notifier_head name = \ SRCU_NOTIFIER_INIT(name, name##_head_srcu_data) #else #define _SRCU_NOTIFIER_HEAD(name, mod) \ mod struct srcu_notifier_head name = \ SRCU_NOTIFIER_INIT(name, name) #endif #define SRCU_NOTIFIER_HEAD(name) \ _SRCU_NOTIFIER_HEAD(name, /* not static */) #define SRCU_NOTIFIER_HEAD_STATIC(name) \ _SRCU_NOTIFIER_HEAD(name, static) #ifdef __KERNEL__ extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *nb); extern int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *nb); extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *nb); extern int atomic_notifier_chain_register_unique_prio( struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_register_unique_prio( struct blocking_notifier_head *nh, struct notifier_block *nb); extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *nb); extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *nb); extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *nb); extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v); extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v); extern int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v); extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v); extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); extern bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh); #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ #define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) /* Bad/Veto action */ /* * Clean way to return from the notifier and stop further calls. */ #define NOTIFY_STOP (NOTIFY_OK|NOTIFY_STOP_MASK) /* Encapsulate (negative) errno value (in particular, NOTIFY_BAD <=> EPERM). */ static inline int notifier_from_errno(int err) { if (err) return NOTIFY_STOP_MASK | (NOTIFY_OK - err); return NOTIFY_OK; } /* Restore (negative) errno value from notify return value. */ static inline int notifier_to_errno(int ret) { ret &= ~NOTIFY_STOP_MASK; return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0; } /* * Declared notifiers so far. I can imagine quite a few more chains * over time (eg laptop power reset chains, reboot chain (to clean * device units up), device [un]mount chain, module load/unload chain, * low memory chain, screenblank chain (for plug in modular screenblankers) * VC switch chains (for loadable kernel svgalib VC switch helpers) etc... */ /* CPU notfiers are defined in include/linux/cpu.h. */ /* netdevice notifiers are defined in include/linux/netdevice.h */ /* reboot notifiers are defined in include/linux/reboot.h. */ /* Hibernation and suspend events are defined in include/linux/suspend.h. */ /* Virtual Terminal events are defined in include/linux/vt.h. */ #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ /* Console keyboard events. * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and * KBD_KEYSYM. */ #define KBD_KEYCODE 0x0001 /* Keyboard keycode, called before any other */ #define KBD_UNBOUND_KEYCODE 0x0002 /* Keyboard keycode which is not bound to any other */ #define KBD_UNICODE 0x0003 /* Keyboard unicode */ #define KBD_KEYSYM 0x0004 /* Keyboard keysym */ #define KBD_POST_KEYSYM 0x0005 /* Called after keyboard keysym interpretation */ #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ |
54 16 206 251 53 114 60 53 53 53 53 139 139 140 140 72 114 121 60 137 139 30 138 140 44 53 53 53 53 53 28 52 53 53 51 52 52 53 53 53 53 52 53 28 53 53 52 40 53 14 14 28 28 28 28 28 13 13 13 13 13 13 13 44 43 43 44 44 161 162 162 23 161 202 184 193 142 44 194 191 205 204 203 203 205 205 53 53 202 51 194 13 13 194 44 193 72 60 19 3 62 23 198 198 130 196 196 196 139 13 89 126 2 2 1 1 1 1 1 1 139 62 49 36 36 37 61 62 62 98 62 6 62 51 64 1 63 6 69 6 2 2 166 2 161 79 79 4 1 50 1 1 1 43 2 6 2 6 3 5 41 44 5 58 49 50 73 12 748 752 743 872 879 54 870 881 575 873 871 341 363 320 354 313 334 30 30 51 777 767 4 772 774 71 744 744 740 96 97 48 70 89 89 35 8 164 114 108 2 86 98 105 100 4 1 105 5 98 15 89 15 89 9 97 95 2 46 91 25 26 26 26 25 25 25 25 25 25 25 25 25 21 13 13 13 13 12 13 13 13 13 13 13 13 13 13 13 13 13 13 132 132 129 130 130 130 132 132 132 131 132 132 59 132 131 1 131 130 13 132 13 11 1 12 3 8 5 7 5 3 1 3 5 12 12 13 12 38 38 4 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. * * Jens Laas <jens.laas@data.slu.se> Swedish University of * Agricultural Sciences. * * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet * * This work is based on the LPC-trie which is originally described in: * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * * Code from fib_hash has been reused which includes the following header: * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> * Stephen Hemminger <shemminger@osdl.org> * Paul E. McKenney <paulmck@us.ibm.com> * Patrick McHardy <kaber@trash.net> */ #include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/notifier.h> #include <net/net_namespace.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_alias *fa, struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { .info.extack = extack, .dst = dst, .dst_len = dst_len, .fi = fa->fa_info, .dscp = fa->fa_dscp, .type = fa->fa_type, .tb_id = fa->tb_id, }; return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 #define KEYLENGTH (8*sizeof(t_key)) #define KEY_MAX ((t_key)~0) typedef unsigned int t_key; #define IS_TRIE(n) ((n)->pos >= KEYLENGTH) #define IS_TNODE(n) ((n)->bits) #define IS_LEAF(n) (!(n)->bits) struct key_vector { t_key key; unsigned char pos; /* 2log(KEYLENGTH) bits needed */ unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; union { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; struct tnode { struct rcu_head rcu; t_key empty_children; /* KEYLENGTH bits needed */ t_key full_children; /* KEYLENGTH bits needed */ struct key_vector __rcu *parent; struct key_vector kv[1]; #define tn_bits kv[0].bits }; #define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) #define LEAF_SIZE TNODE_SIZE(1) #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; unsigned int backtrack; unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; unsigned int resize_node_skipped; }; #endif struct trie_stat { unsigned int totdepth; unsigned int maxdepth; unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; unsigned int prefixes; unsigned int nodesizes[MAX_STAT_DEPTH]; }; struct trie { struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); static unsigned int tnode_free_size; /* * synchronize_rcu after call_rcu for outstanding dirty memory; it should be * especially useful before resizing the root node with PREEMPT_NONE configs; * the value was obtained experimentally, aiming to avoid visible slowdown. */ unsigned int sysctl_fib_sync_mem = 512 * 1024; unsigned int sysctl_fib_sync_mem_min = 64 * 1024; unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; static struct kmem_cache *fn_alias_kmem __ro_after_init; static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { return container_of(kv, struct tnode, kv[0]); } /* caller must hold RTNL */ #define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) #define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ #define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) #define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* wrapper for rcu_assign_pointer */ static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) rcu_assign_pointer(tn_info(n)->parent, tp); } #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } #define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) static inline unsigned long get_index(t_key key, struct key_vector *kv) { unsigned long index = key ^ kv->key; if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) return 0; return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is * necessary. Every node in the trie has a key associated with it, but not * all of the bits in that key are significant. * * Consider a node 'n' and its parent 'tp'. * * If n is a leaf, every bit in its key is significant. Its presence is * necessitated by path compression, since during a tree traversal (when * searching for a leaf - unless we are doing an insertion) we will completely * ignore all skipped bits we encounter. Thus we need to verify, at the end of * a potentially successful search, that we have indeed been walking the * correct key path. * * Note that we can never "miss" the correct key in the tree if present by * following the wrong path. Path compression ensures that segments of the key * that are the same for all keys with a given prefix are skipped, but the * skipped part *is* identical for each node in the subtrie below the skipped * bit! trie_insert() in this implementation takes care of that. * * if n is an internal node - a 'tnode' here, the various parts of its key * have many different meanings. * * Example: * _________________________________________________________________ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | * ----------------------------------------------------------------- * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 * * _________________________________________________________________ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | * ----------------------------------------------------------------- * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 * * tp->pos = 22 * tp->bits = 3 * n->pos = 13 * n->bits = 4 * * First, let's just ignore the bits that come before the parent tp, that is * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this * point we do not use them for anything. * * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the * index into the parent's child array. That is, they will be used to find * 'n' among tp's children. * * The bits from (n->pos + n->bits) to (tp->pos - 1) - "S" - are skipped bits * for the node n. * * All the bits we have seen so far are significant to the node n. The rest * of the bits are really not needed or indeed known in n->key. * * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into * n's child array, and will of course be different for each child. * * The rest of the bits, from 0 to (n->pos -1) - "u" - are completely unknown * at this point. */ static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 30; static inline void alias_free_mem_rcu(struct fib_alias *fa) { kfree_rcu(fa, rcu); } #define TNODE_VMALLOC_MAX \ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); else kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) static struct tnode *tnode_alloc(int bits) { size_t size; /* verify bits is within bounds */ if (bits > TNODE_VMALLOC_MAX) return NULL; /* determine size and verify it is non-zero and didn't overflow */ size = TNODE_SIZE(1ul << bits); if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } static inline void empty_child_inc(struct key_vector *n) { tn_info(n)->empty_children++; if (!tn_info(n)->empty_children) tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { if (!tn_info(n)->empty_children) tn_info(n)->full_children--; tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { struct key_vector *l; struct tnode *kv; kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (!kv) return NULL; /* initialize key vector */ l = kv->kv; l->key = key; l->pos = 0; l->bits = 0; l->slen = fa->fa_slen; /* link leaf to fib alias */ INIT_HLIST_HEAD(&l->leaf); hlist_add_head(&fa->fa_list, &l->leaf); return l; } static struct key_vector *tnode_new(t_key key, int pos, int bits) { unsigned int shift = pos + bits; struct key_vector *tn; struct tnode *tnode; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); tnode = tnode_alloc(bits); if (!tnode) return NULL; pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), sizeof(struct key_vector *) << bits); if (bits == KEYLENGTH) tnode->full_children = 1; else tnode->empty_children = 1ul << bits; tn = tnode->kv; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->pos = pos; tn->bits = bits; tn->slen = pos; return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */ static void put_child(struct key_vector *tn, unsigned long i, struct key_vector *n) { struct key_vector *chi = get_child(tn, i); int isfull, wasfull; BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ if (!n && chi) empty_child_inc(tn); if (n && !chi) empty_child_dec(tn); /* update fullChildren */ wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn_info(tn)->full_children--; else if (!wasfull && isfull) tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; rcu_assign_pointer(tn->tnode[i], n); } static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); if (!inode) continue; /* Either update the children of a tnode that * already belongs to us or update the child * to point to ourselves. */ if (node_parent(inode) == tn) update_children(inode); else node_set_parent(inode, tn); } } static inline void put_child_root(struct key_vector *tp, t_key key, struct key_vector *n) { if (IS_TRIE(tp)) rcu_assign_pointer(tp->tnode[0], n); else put_child(tp, get_index(key, tp), n); } static inline void tnode_free_init(struct key_vector *tn) { tn_info(tn)->rcu.next = NULL; } static inline void tnode_free_append(struct key_vector *tn, struct key_vector *n) { tn_info(n)->rcu.next = tn_info(tn)->rcu.next; tn_info(tn)->rcu.next = &tn_info(n)->rcu; } static void tnode_free(struct key_vector *tn) { struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; synchronize_net(); } } static struct key_vector *replace(struct trie *t, struct key_vector *oldtnode, struct key_vector *tn) { struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); /* all pointers should be clean so we are done */ tnode_free(oldtnode); /* resize children now that oldtnode is freed */ for (i = child_length(tn); i;) { struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) tn = resize(t, inode); } return tp; } static struct key_vector *inflate(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; t_key m; pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { struct key_vector *inode = get_child(oldtnode, --i); struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ if (!inode) continue; /* A leaf or an internal node with skipped bits */ if (!tnode_full(oldtnode, inode)) { put_child(tn, get_index(inode->key, tn), inode); continue; } /* drop the node in the old tnode free list */ tnode_free_append(oldtnode, inode); /* An internal node with two children */ if (inode->bits == 1) { put_child(tn, 2 * i + 1, get_child(inode, 1)); put_child(tn, 2 * i, get_child(inode, 0)); continue; } /* We will replace this node 'inode' with two new * ones, 'node0' and 'node1', each with half of the * original children. The two new nodes will have * a position one bit further down the key and this * means that the "significant" part of their keys * (see the discussion near the top of this file) * will differ by one bit, which will be "0" in * node0's key and "1" in node1's key. Since we are * moving the key position by one step, the bit that * we are moving away from - the bit at position * (tn->pos) - is the one that will differ between * node0 and node1. So... we synthesize that bit in the * two new keys. */ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1); if (!node1) goto nomem; node0 = tnode_new(inode->key, inode->pos, inode->bits - 1); tnode_free_append(tn, node1); if (!node0) goto nomem; tnode_free_append(tn, node0); /* populate child pointers in new nodes */ for (k = child_length(inode), j = k / 2; j;) { put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); put_child(node1, --j, get_child(inode, --k)); put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ NODE_INIT_PARENT(node1, tn); NODE_INIT_PARENT(node0, tn); /* link parent to nodes */ put_child(tn, 2 * i + 1, node1); put_child(tn, 2 * i, node0); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *halve(struct trie *t, struct key_vector *oldtnode) { struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); /* Assemble all of the pointers in our cluster, in this case that * represents all of the pointers out of our allocated nodes that * point to existing tnodes and the links between our allocated * nodes. */ for (i = child_length(oldtnode); i;) { struct key_vector *node1 = get_child(oldtnode, --i); struct key_vector *node0 = get_child(oldtnode, --i); struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { put_child(tn, i / 2, node1 ? : node0); continue; } /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); if (!inode) goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ put_child(inode, 1, node1); put_child(inode, 0, node0); NODE_INIT_PARENT(inode, tn); /* link parent to node */ put_child(tn, i / 2, inode); } /* setup the parent pointers into and out of this node */ return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); notnode: return NULL; } static struct key_vector *collapse(struct trie *t, struct key_vector *oldtnode) { struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ for (n = NULL, i = child_length(oldtnode); !n && i;) n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); return tp; } static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; unsigned char slen_max; /* only vector 0 can have a suffix length greater than or equal to * tn->pos + tn->bits, the second highest node will have a suffix * length at most of tn->pos + tn->bits - 1 */ slen_max = min_t(unsigned char, tn->pos + tn->bits - 1, tn->slen); /* search though the list of children looking for nodes that might * have a suffix greater than the one we currently have. This is * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; /* update stride and slen based on new value */ stride <<= (n->slen - slen); slen = n->slen; i &= ~(stride - 1); /* stop searching if we have hit the maximum possible value */ if (slen >= slen_max) break; } tn->slen = slen; return slen; } /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: * "A node is doubled if the ratio of non-empty children to all * children in the *doubled* node is at least 'high'." * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. * All of those will be doubled in the resulting inflated tnode, so * we just count them one extra time here. * * A clearer way to write this would be: * * to_be_doubled = tn->full_children; * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; * if (new_fill_factor >= inflate_threshold) * * ...and so on, tho it would mess up the while () loop. * * anyway, * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= * inflate_threshold * * avoid a division: * 100 * (not_to_be_doubled + 2*to_be_doubled) >= * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= * inflate_threshold * child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * * child_length(tn) * */ static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold; used -= tn_info(tn)->empty_children; used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold; used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } static inline bool should_collapse(struct key_vector *tn) { unsigned long used = child_length(tn); used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ return used < 2; } #define MAX_WORK 10 static struct key_vector *resize(struct trie *t, struct key_vector *tn) { #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif struct key_vector *tp = node_parent(tn); unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); /* track the tnode via the pointer from the parent instead of * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { tp = inflate(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* update parent in case inflate failed */ tp = node_parent(tn); /* Return if at least one inflate is run */ if (max_work != MAX_WORK) return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ while (should_halve(tp, tn) && max_work) { tp = halve(t, tn); if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; tn = get_child(tp, cindex); } /* Only one child remains */ if (should_collapse(tn)) return collapse(t, tn); /* update parent in case halve failed */ return node_parent(tn); } static void node_pull_suffix(struct key_vector *tn, unsigned char slen) { unsigned char node_slen = tn->slen; while ((node_slen > tn->pos) && (node_slen > slen)) { slen = update_suffix(tn); if (node_slen == slen) break; tn = node_parent(tn); node_slen = tn->slen; } } static void node_push_suffix(struct key_vector *tn, unsigned char slen) { while (tn->slen < slen) { tn->slen = slen; tn = node_parent(tn); } } /* rcu_read_lock needs to be hold by caller from readside */ static struct key_vector *fib_find_node(struct trie *t, struct key_vector **tp, u32 key) { struct key_vector *pn, *n = t->kv; unsigned long index = 0; do { pn = n; n = get_child_rcu(n, index); if (!n) break; index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) { n = NULL; break; } /* keep searching until we find a perfect match leaf or NULL */ } while (IS_TNODE(n)); *tp = pn; return n; } /* Return the first fib alias matching DSCP with * priority less than or equal to PRIO. * If 'find_first' is set, return the first matching * fib alias, regardless of DSCP and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, dscp_t dscp, u32 prio, u32 tb_id, bool find_first) { struct fib_alias *fa; if (!fah) return NULL; hlist_for_each_entry(fa, fah, fa_list) { /* Avoid Sparse warning when using dscp_t in inequalities */ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp); u8 __dscp = inet_dscp_to_dsfield(dscp); if (fa->fa_slen < slen) continue; if (fa->fa_slen != slen) break; if (fa->tb_id > tb_id) continue; if (fa->tb_id != tb_id) break; if (find_first) return fa; if (__fa_dscp > __dscp) continue; if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp) return fa; } return NULL; } static struct fib_alias * fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) { u8 slen = KEYLENGTH - fri->dst_len; struct key_vector *l, *tp; struct fib_table *tb; struct fib_alias *fa; struct trie *t; tb = fib_get_table(net, fri->tb_id); if (!tb) return NULL; t = (struct trie *)tb->tb_data; l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); if (!l) return NULL; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi && fa->fa_type == fri->type) return fa; } return NULL; } void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { u8 fib_notify_on_flag_change; struct fib_alias *fa_match; struct sk_buff *skb; int err; rcu_read_lock(); fa_match = fib_find_matching_alias(net, fri); if (!fa_match) goto out; /* These are paired with the WRITE_ONCE() happening in this function. * The reason is that we are only protected by RCU at this point. */ if (READ_ONCE(fa_match->offload) == fri->offload && READ_ONCE(fa_match->trap) == fri->trap && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload, fri->offload); WRITE_ONCE(fa_match->trap, fri->trap); fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. */ if (fib_notify_on_flag_change == 2 && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); if (!fib_notify_on_flag_change) goto out; skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); if (!skb) { err = -ENOBUFS; goto errout; } err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); goto out; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) tn = resize(t, tn); } static int fib_insert_node(struct trie *t, struct key_vector *tp, struct fib_alias *new, t_key key) { struct key_vector *n, *l; l = leaf_new(key, new); if (!l) goto noleaf; /* retrieve child from parent node */ n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. * * Add a new tnode here * first tnode need some special handling * leaves us in position for handling as case 3 */ if (n) { struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); if (!tn) goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ tp = tn; } /* Case 3: n is NULL, and will just insert a new leaf */ node_push_suffix(tp, new->fa_slen); NODE_INIT_PARENT(l, tp); put_child_root(tp, key, l); trie_rebalance(t, tp); return 0; notnode: node_free(l); noleaf: return -ENOMEM; } static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) { if (!l) return fib_insert_node(t, tp, new, key); if (fa) { hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { struct fib_alias *last; hlist_for_each_entry(last, &l->leaf, fa_list) { if (new->fa_slen < last->fa_slen) break; if ((new->fa_slen == last->fa_slen) && (new->tb_id > last->tb_id)) break; fa = last; } if (fa) hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); else hlist_add_head_rcu(&new->fa_list, &l->leaf); } /* if we added to the tail node then we need to update slen */ if (l->slen < new->fa_slen) { l->slen = new->fa_slen; node_push_suffix(tp, new->fa_slen); } return 0; } static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) { if (plen > KEYLENGTH) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); return false; } if ((plen < KEYLENGTH) && (key << plen)) { NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); return false; } return true; } static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; u16 nlflags = NLM_F_EXCL; struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; int err; key = ntohl(cfg->fc_dst); if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; } dscp = cfg->fc_dscp; l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority, tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,dscp,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and * insert to the tail of the section matching the suffix length * of the new alias. */ if (fa && fa->fa_dscp == dscp && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_first, *fa_match; err = -EEXIST; if (cfg->fc_nlflags & NLM_F_EXCL) goto out; nlflags &= ~NLM_F_EXCL; /* We have 2 goals: * 1. Find exact match for type, scope, fib_info to avoid * duplicate routes * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it */ fa_match = NULL; fa_first = fa; hlist_for_each_entry_from(fa, fa_list) { if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; if (fa->fa_type == cfg->fc_type && fa->fa_info == fi) { fa_match = fa; break; } } if (cfg->fc_nlflags & NLM_F_REPLACE) { struct fib_info *fi_drop; u8 state; nlflags |= NLM_F_REPLACE; fa = fa_first; if (fa_match) { if (fa == fa_match) err = 0; goto out; } err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; fi_drop = fa->fa_info; new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) { hlist_replace_rcu(&new_fa->fa_list, &fa->fa_list); goto out_free_new_fa; } } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); alias_free_mem_rcu(fa); fib_release_info(fi_drop); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); goto succeeded; } /* Error if we find a perfect match which * uses the same scope, type, and nexthop * information. */ if (fa_match) goto out; if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; else fa = fa_first; } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) goto out; nlflags |= NLM_F_CREATE; err = -ENOBUFS; new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; new_fa->fa_info = fi; new_fa->fa_dscp = dscp; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; new_fa->offload = 0; new_fa->trap = 0; new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) goto out_free_new_fa; /* The alias was already inserted, so the node must exist. */ l = l ? l : fib_find_node(t, &tp, key); if (WARN_ON_ONCE(!l)) { err = -ENOENT; goto out_free_new_fa; } if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == new_fa) { enum fib_event_type fib_event; fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib_entry_notifiers(net, fib_event, key, plen, new_fa, extack); if (err) goto out_remove_new_fa; } if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; out_remove_new_fa: fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); err: return err; } static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; return (key ^ prefix) & (prefix | -prefix); } bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp) { if (nhc->nhc_flags & RTNH_F_DEAD) return false; if (ip_ignore_linkdown(nhc->nhc_dev) && nhc->nhc_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) return false; if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif) return false; return true; } /* should be called with rcu_read_lock */ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif const t_key key = ntohl(flp->daddr); struct key_vector *n, *pn; struct fib_alias *fa; unsigned long index; t_key cindex; pn = t->kv; cindex = 0; n = get_child_rcu(pn, cindex); if (!n) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); #endif /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex * * This check is safe even if bits == KEYLENGTH due to the * fact that we can only allocate a node with 32 bits if a * long is greater than 32 bits. */ if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ if (IS_LEAF(n)) goto found; /* only record pn and cindex if we are going to be chopping * bits later. Otherwise we are just wasting cycles. */ if (n->slen > n->pos) { pn = n; cindex = index; } n = get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ struct key_vector __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of * the lsb and higher in the prefix. */ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos)) goto backtrace; /* exit out and process leaf */ if (unlikely(IS_LEAF(n))) break; /* Don't bother recording parent info. Since we are in * prefix match mode we will have to come back to wherever * we started this traversal anyway */ while ((n = rcu_dereference(*cptr)) == NULL) { backtrace: #ifdef CONFIG_IP_FIB_TRIE_STATS if (!n) this_cpu_inc(stats->null_node_hit); #endif /* If we are at cindex 0 there are no more bits for * us to strip at this level so we must ascend back * up one level to see if there are any more bits to * be stripped there. */ while (!cindex) { t_key pkey = pn->key; /* If we don't have a parent then there is * nothing for us to do as we do not have any * further nodes to parse. */ if (IS_TRIE(pn)) { trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif /* Get Child's index */ pn = node_parent_rcu(pn); cindex = get_index(pkey, pn); } /* strip the least significant bit from the cindex */ cindex &= cindex - 1; /* grab pointer for next child node */ cptr = &pn->tnode[cindex]; } } found: /* this line carries forward the xor from earlier in the function */ index = key ^ n->key; /* Step 3: Process the leaf, if that fails fall back to backtracing */ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; struct fib_nh_common *nhc; int nhsel, err; if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { if (index >= (1ul << fa->fa_slen)) continue; } if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; /* Paired with WRITE_ONCE() in fib_release_info() */ if (READ_ONCE(fi->fib_dead)) continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, NULL, err); return err; } if (fi->fib_flags & RTNH_F_DEAD) continue; if (unlikely(fi->nh)) { if (nexthop_is_blackhole(fi->nh)) { err = fib_props[RTN_BLACKHOLE].error; goto out_reject; } nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp, &nhsel); if (nhc) goto set_result; goto miss; } for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { nhc = fib_info_nhc(fi, nhsel); if (!fib_lookup_good_nhc(nhc, fib_flags, flp)) continue; set_result: if (!(fib_flags & FIB_LOOKUP_NOREF)) refcount_inc(&fi->fib_clntref); res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; res->dscp = fa->fa_dscp; res->fi = fi; res->table = tb; res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif trace_fib_table_lookup(tb->tb_id, flp, nhc, err); return err; } } miss: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_miss); #endif goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old) { /* record the location of the previous list_info entry */ struct hlist_node **pprev = old->fa_list.pprev; struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); /* remove the fib_alias from the list */ hlist_del_rcu(&old->fa_list); /* if we emptied the list this leaf will be freed and we can sort * out parent suffix lengths as a part of trie_rebalance */ if (hlist_empty(&l->leaf)) { if (tp->slen == l->slen) node_pull_suffix(tp, tp->pos); put_child_root(tp, l->key, NULL); node_free(l); trie_rebalance(t, tp); return; } /* only access fa if it is pointing at the last valid hlist_node */ if (*pprev) return; /* update the trie with the latest suffix length */ l->slen = fa->fa_slen; node_pull_suffix(tp, fa->fa_slen); } static void fib_notify_alias_delete(struct net *net, u32 key, struct hlist_head *fah, struct fib_alias *fa_to_delete, struct netlink_ext_ack *extack) { struct fib_alias *fa_next, *fa_to_notify; u32 tb_id = fa_to_delete->tb_id; u8 slen = fa_to_delete->fa_slen; enum fib_event_type fib_event; /* Do not notify if we do not care about the route. */ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) return; /* Determine if the route should be replaced by the next route in the * list. */ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, struct fib_alias, fa_list); if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { fib_event = FIB_EVENT_ENTRY_REPLACE; fa_to_notify = fa_next; } else { fib_event = FIB_EVENT_ENTRY_DEL; fa_to_notify = fa_to_delete; } call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, fa_to_notify, extack); } /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *fa_to_delete; struct key_vector *l, *tp; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; dscp_t dscp; u32 key; key = ntohl(cfg->fc_dst); if (!fib_valid_key_len(key, plen, extack)) return -EINVAL; l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; dscp = cfg->fc_dscp; fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false); if (!fa) return -ESRCH; pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen, inet_dscp_to_dsfield(dscp), t); fa_to_delete = NULL; hlist_for_each_entry_from(fa, fa_list) { struct fib_info *fi = fa->fa_info; if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || (fa->fa_dscp != dscp)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && (cfg->fc_scope == RT_SCOPE_NOWHERE || fa->fa_info->fib_scope == cfg->fc_scope) && (!cfg->fc_prefsrc || fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && fib_nh_match(net, cfg, fi, extack) == 0 && fib_metrics_match(cfg, fi)) { fa_to_delete = fa; break; } } if (!fa_to_delete) return -ESRCH; fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); if (!plen) tb->tb_num_default--; fib_remove_alias(t, tp, l, fa_to_delete); if (fa_to_delete->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa_to_delete->fa_info); alias_free_mem_rcu(fa_to_delete); return 0; } /* Scan for the next leaf starting at the provided key value */ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) { struct key_vector *pn, *n = *tn; unsigned long cindex; /* this loop is meant to try and find the key in the trie */ do { /* record parent and next child index */ pn = n; cindex = (key > pn->key) ? get_index(key, pn) : 0; if (cindex >> pn->bits) break; /* descend into the next child */ n = get_child_rcu(pn, cindex++); if (!n) break; /* guarantee forward progress on the keys */ if (IS_LEAF(n) && (n->key >= key)) goto found; } while (IS_TNODE(n)); /* this loop will search for the next leaf with a greater key */ while (!IS_TRIE(pn)) { /* if we exhausted the parent node we will need to climb */ if (cindex >= (1ul << pn->bits)) { t_key pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; continue; } /* grab the next available node */ n = get_child_rcu(pn, cindex++); if (!n) continue; /* no need to compare keys since we bumped the index */ if (IS_LEAF(n)) goto found; /* Rescan start scanning in new node */ pn = n; cindex = 0; } *tn = pn; return NULL; /* Root of trie */ found: /* if we are at the limit for keys just return NULL for the tnode */ *tn = pn; return n; } static void fib_trie_free(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order and free everything */ for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; n = pn; pn = node_parent(pn); /* drop emptied tnode */ put_child_root(pn, n->key, NULL); node_free(n); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); } put_child_root(pn, n->key, NULL); node_free(n); } #ifdef CONFIG_IP_FIB_TRIE_STATS free_percpu(t->stats); #endif kfree(tb); } struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { struct trie *ot = (struct trie *)oldtb->tb_data; struct key_vector *l, *tp = ot->kv; struct fib_table *local_tb; struct fib_alias *fa; struct trie *lt; t_key key = 0; if (oldtb->tb_data == oldtb->__data) return oldtb; local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); if (!local_tb) return NULL; lt = (struct trie *)local_tb->tb_data; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp; hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) continue; /* clone fa for new local table */ new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); if (!new_fa) goto out; memcpy(new_fa, fa, sizeof(*fa)); /* insert clone into table */ if (!local_l) local_l = fib_find_node(lt, &local_tp, l->key); if (fib_insert_alias(lt, local_tp, local_l, new_fa, NULL, l->key)) { kmem_cache_free(fn_alias_kmem, new_fa); goto out; } } /* stop loop if key wrapped back to 0 */ key = l->key + 1; if (key < l->key) break; } return local_tb; out: fib_trie_free(local_tb); return NULL; } /* Caller must hold RTNL */ void fib_table_flush_external(struct fib_table *tb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { /* if alias was cloned to local then we just * need to remove the local copy from main */ if (tb->tb_id != fa->tb_id) { hlist_del_rcu(&fa->fa_list); alias_free_mem_rcu(fa); continue; } /* record local slen */ slen = fa->fa_slen; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } } /* Caller must hold RTNL. */ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; struct nl_info info = { .nl_net = net }; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; struct fib_alias *fa; int found = 0; /* walk trie in reverse order */ for (;;) { unsigned char slen = 0; struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; /* cannot resize the trie vector */ if (IS_TRIE(pn)) break; /* update the suffix to address pulled leaves */ if (pn->slen > pn->pos) update_suffix(pn); /* resize completed node */ pn = resize(t, pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || tb->tb_id != fa->tb_id || (!(fi->fib_flags & RTNH_F_DEAD) && !fib_props[fa->fa_type].error)) { slen = fa->fa_slen; continue; } /* Do not flush error routes if network namespace is * not being dismantled */ if (!flush_all && fib_props[fa->fa_type].error) { slen = fa->fa_slen; continue; } fib_notify_alias_delete(net, n->key, &n->leaf, fa, NULL); if (fi->pfsrc_removed) rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); found++; } /* update leaf slen */ n->slen = slen; if (hlist_empty(&n->leaf)) { put_child_root(pn, n->key, NULL); node_free(n); } } pr_debug("trie_flush found=%d\n", found); return found; } /* derived from fib_trie_free */ static void __fib_info_notify_update(struct net *net, struct fib_table *tb, struct nl_info *info) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct fib_alias *fa; for (;;) { struct key_vector *n; if (!(cindex--)) { t_key pkey = pn->key; if (IS_TRIE(pn)) break; pn = node_parent(pn); cindex = get_index(pkey, pn); continue; } /* grab the next available node */ n = get_child(pn, cindex); if (!n) continue; if (IS_TNODE(n)) { /* record pn and cindex for leaf walking */ pn = n; cindex = 1ul << n->bits; continue; } hlist_for_each_entry(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) continue; rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, KEYLENGTH - fa->fa_slen, tb->tb_id, info, NLM_F_REPLACE); } } } void fib_info_notify_update(struct net *net, struct nl_info *info) { unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist, lockdep_rtnl_is_held()) __fib_info_notify_update(net, tb, info); } } static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib_alias *fa; int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (!fi) continue; /* local and main table can share the same trie, * so don't notify twice for the same entry. */ if (tb->tb_id != fa->tb_id) continue; if (fa->fa_slen == last_slen) continue; last_slen = fa->fa_slen; err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) return err; } return 0; } static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { err = fib_leaf_notify(l, tb, nb, extack); if (err) return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } return 0; } int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { unsigned int h; int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { err = fib_table_notify(tb, nb, extack); if (err) return err; } } return 0; } static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; if (tb->tb_data == tb->__data) free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } void fib_free_table(struct fib_table *tb) { call_rcu(&tb->rcu, __trie_free_rcu); } static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; if (filter->filter_set || !filter->dump_exceptions || !filter->dump_routes) flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; if (i < s_i) goto next; i_fa = 0; if (tb->tb_id != fa->tb_id) goto next; if (filter->filter_set) { if (filter->rt_type && fa->fa_type != filter->rt_type) goto next; if ((filter->protocol && fi->fib_protocol != filter->protocol)) goto next; if (filter->dev && !fib_info_nh_uses_dev(fi, filter->dev)) goto next; } if (filter->dump_routes) { if (!s_fa) { struct fib_rt_info fri; fri.fi = fi; fri.tb_id = tb->tb_id; fri.dst = xkey; fri.dst_len = KEYLENGTH - fa->fa_slen; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } i_fa++; } if (filter->dump_exceptions) { err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, &i_fa, s_fa, flags); if (err < 0) goto stop; } next: i++; } cb->args[4] = i; return skb->len; stop: cb->args[4] = i; cb->args[5] = i_fa; return err; } /* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ int count = cb->args[2]; t_key key = cb->args[3]; /* First time here, count and key are both always 0. Count > 0 * and key == 0 means the dump has wrapped around and we are done. */ if (count && !key) return 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; err = fn_trie_dump_leaf(l, tb, skb, cb, filter); if (err < 0) { cb->args[3] = key; cb->args[2] = count; return err; } ++count; key = l->key + 1; memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); /* stop loop if key wrapped back to 0 */ if (key < l->key) break; } cb->args[3] = key; cb->args[2] = count; return 0; } void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; size_t sz = sizeof(*tb); if (!alias) sz += sizeof(struct trie); tb = kzalloc(sz, GFP_KERNEL); if (!tb) return NULL; tb->tb_id = id; tb->tb_num_default = 0; tb->tb_data = (alias ? alias->__data : tb->__data); if (alias) return tb; t = (struct trie *) tb->tb_data; t->kv[0].pos = KEYLENGTH; t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { kfree(tb); tb = NULL; } #endif return tb; } #ifdef CONFIG_PROC_FS /* Depth first Trie walk iterator */ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct key_vector *tnode; unsigned int index; unsigned int depth; }; static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; struct key_vector *pn = iter->tnode; t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); while (!IS_TRIE(pn)) { while (cindex < child_length(pn)) { struct key_vector *n = get_child_rcu(pn, cindex++); if (!n) continue; if (IS_LEAF(n)) { iter->tnode = pn; iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } return n; } /* Current node exhausted, pop back up */ pkey = pn->key; pn = node_parent_rcu(pn); cindex = get_index(pkey, pn) + 1; --iter->depth; } /* record root node so further searches know we are done */ iter->tnode = pn; iter->index = 0; return NULL; } static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { struct key_vector *n, *pn; if (!t) return NULL; pn = t->kv; n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; if (IS_TNODE(n)) { iter->tnode = n; iter->index = 0; iter->depth = 1; } else { iter->tnode = pn; iter->index = 0; iter->depth = 0; } return n; } static void trie_collect_stats(struct trie *t, struct trie_stat *s) { struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); } /* * This outputs /proc/net/fib_triestats */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { unsigned int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; else avdepth = 0; seq_printf(seq, "\tAver depth: %u.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) max--; pointers = 0; for (i = 1; i < max; i++) if (stat->nodesizes[i] != 0) { seq_printf(seq, " %u: %u", i, stat->nodesizes[i]); pointers += (1<<i) * stat->nodesizes[i]; } seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } #ifdef CONFIG_IP_FIB_TRIE_STATS static void trie_show_usage(struct seq_file *seq, const struct trie_use_stats __percpu *stats) { struct trie_use_stats s = { 0 }; int cpu; /* loop through all of the CPUs and gather up the stats */ for_each_possible_cpu(cpu) { const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu); s.gets += pcpu->gets; s.backtrack += pcpu->backtrack; s.semantic_match_passed += pcpu->semantic_match_passed; s.semantic_match_miss += pcpu->semantic_match_miss; s.null_node_hit += pcpu->null_node_hit; s.resize_node_skipped += pcpu->resize_node_skipped; } seq_printf(seq, "\nCounters:\n---------\n"); seq_printf(seq, "gets = %u\n", s.gets); seq_printf(seq, "backtracks = %u\n", s.backtrack); seq_printf(seq, "semantic match passed = %u\n", s.semantic_match_passed); seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss); seq_printf(seq, "null node hit= %u\n", s.null_node_hit); seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped); } #endif /* CONFIG_IP_FIB_TRIE_STATS */ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) { if (tb->tb_id == RT_TABLE_LOCAL) seq_puts(seq, "Local:\n"); else if (tb->tb_id == RT_TABLE_MAIN) seq_puts(seq, "Main:\n"); else seq_printf(seq, "Id %d:\n", tb->tb_id); } static int fib_triestat_seq_show(struct seq_file *seq, void *v) { struct net *net = seq->private; unsigned int h; seq_printf(seq, "Basic info: size of leaf:" " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); rcu_read_lock(); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct trie *t = (struct trie *) tb->tb_data; struct trie_stat stat; if (!t) continue; fib_table_print(seq, tb); trie_collect_stats(t, &stat); trie_show_stats(seq, &stat); #ifdef CONFIG_IP_FIB_TRIE_STATS trie_show_usage(seq, t->stats); #endif } cond_resched_rcu(); } rcu_read_unlock(); return 0; } static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); loff_t idx = 0; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); n; n = fib_trie_get_next(iter)) if (pos == idx++) { iter->tb = tb; return n; } } } return NULL; } static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return fib_trie_get_idx(seq, *pos); } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; struct key_vector *n; ++*pos; /* next node in same table */ n = fib_trie_get_next(iter); if (n) return n; /* walk rest of this hash chain */ h = tb->tb_id & (FIB_TABLE_HASHSZ - 1); while ((tb_node = rcu_dereference(hlist_next_rcu(&tb->tb_hlist)))) { tb = hlist_entry(tb_node, struct fib_table, tb_hlist); n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } /* new hash chain */ while (++h < FIB_TABLE_HASHSZ) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); if (n) goto found; } } return NULL; found: iter->tb = tb; return n; } static void fib_trie_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void seq_indent(struct seq_file *seq, int n) { while (n-- > 0) seq_puts(seq, " "); } static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) { switch (s) { case RT_SCOPE_UNIVERSE: return "universe"; case RT_SCOPE_SITE: return "site"; case RT_SCOPE_LINK: return "link"; case RT_SCOPE_HOST: return "host"; case RT_SCOPE_NOWHERE: return "nowhere"; default: snprintf(buf, len, "scope=%d", s); return buf; } } static const char *const rtn_type_names[__RTN_MAX] = { [RTN_UNSPEC] = "UNSPEC", [RTN_UNICAST] = "UNICAST", [RTN_LOCAL] = "LOCAL", [RTN_BROADCAST] = "BROADCAST", [RTN_ANYCAST] = "ANYCAST", [RTN_MULTICAST] = "MULTICAST", [RTN_BLACKHOLE] = "BLACKHOLE", [RTN_UNREACHABLE] = "UNREACHABLE", [RTN_PROHIBIT] = "PROHIBIT", [RTN_THROW] = "THROW", [RTN_NAT] = "NAT", [RTN_XRESOLVE] = "XRESOLVE", }; static inline const char *rtn_type(char *buf, size_t len, unsigned int t) { if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; snprintf(buf, len, "type %u", t); return buf; } /* Pretty print the trie */ static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; struct key_vector *n = v; if (IS_TRIE(node_parent_rcu(n))) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { __be32 prf = htonl(n->key); seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", &prf, KEYLENGTH - n->pos - n->bits, n->bits, tn_info(n)->full_children, tn_info(n)->empty_children); } else { __be32 val = htonl(n->key); struct fib_alias *fa; seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { char buf1[32], buf2[32]; seq_indent(seq, iter->depth + 1); seq_printf(seq, " /%zu %s %s", KEYLENGTH - fa->fa_slen, rtn_scope(buf1, sizeof(buf1), fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); if (fa->fa_dscp) seq_printf(seq, " tos=%d", inet_dscp_to_dsfield(fa->fa_dscp)); seq_putc(seq, '\n'); } } return 0; } static const struct seq_operations fib_trie_seq_ops = { .start = fib_trie_seq_start, .next = fib_trie_seq_next, .stop = fib_trie_seq_stop, .show = fib_trie_seq_show, }; struct fib_route_iter { struct seq_net_private p; struct fib_table *main_tb; struct key_vector *tnode; loff_t pos; t_key key; }; static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) { struct key_vector *l, **tp = &iter->tnode; t_key key; /* use cached location of previously found key */ if (iter->pos > 0 && pos >= iter->pos) { key = iter->key; } else { iter->pos = 1; key = 0; } pos -= iter->pos; while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) { key = l->key + 1; iter->pos++; l = NULL; /* handle unlikely case of a key wrap */ if (!key) break; } if (l) iter->key = l->key; /* remember it */ else iter->pos = 0; /* forget it */ return l; } static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; struct trie *t; rcu_read_lock(); tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; iter->main_tb = tb; t = (struct trie *)tb->tb_data; iter->tnode = t->kv; if (*pos != 0) return fib_route_get_idx(iter, *pos); iter->pos = 0; iter->key = KEY_MAX; return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct key_vector *l = NULL; t_key key = iter->key + 1; ++*pos; /* only allow key of 0 for start of sequence */ if ((v == SEQ_START_TOKEN) || key) l = leaf_walk_rcu(&iter->tnode, key); if (l) { iter->key = l->key; iter->pos++; } else { iter->pos = 0; } return l; } static void fib_route_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; if (fi) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); if (nhc->nhc_gw.ipv4) flags |= RTF_GATEWAY; } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; return flags; } /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) { struct fib_route_iter *iter = seq->private; struct fib_table *tb = iter->main_tb; struct fib_alias *fa; struct key_vector *l = v; __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" "\tWindow\tIRTT"); return 0; } prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); if ((fa->fa_type == RTN_BROADCAST) || (fa->fa_type == RTN_MULTICAST)) continue; if (fa->tb_id != tb->tb_id) continue; seq_setwidth(seq, 127); if (fi) { struct fib_nh_common *nhc = fib_info_nhc(fi, 0); __be32 gw = 0; if (nhc->nhc_gw_family == AF_INET) gw = nhc->nhc_gw.ipv4; seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" "%d\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } seq_pad(seq, '\n'); } return 0; } static const struct seq_operations fib_route_seq_ops = { .start = fib_route_seq_start, .next = fib_route_seq_next, .stop = fib_route_seq_stop, .show = fib_route_seq_show, }; int __net_init fib_proc_init(struct net *net) { if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, sizeof(struct fib_trie_iter))) goto out1; if (!proc_create_net_single("fib_triestat", 0444, net->proc_net, fib_triestat_seq_show, NULL)) goto out2; if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops, sizeof(struct fib_route_iter))) goto out3; return 0; out3: remove_proc_entry("fib_triestat", net->proc_net); out2: remove_proc_entry("fib_trie", net->proc_net); out1: return -ENOMEM; } void __net_exit fib_proc_exit(struct net *net) { remove_proc_entry("fib_trie", net->proc_net); remove_proc_entry("fib_triestat", net->proc_net); remove_proc_entry("route", net->proc_net); } #endif /* CONFIG_PROC_FS */ |
3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_INTERNAL_H #define __LINUX_BLK_CRYPTO_INTERNAL_H #include <linux/bio.h> #include <linux/blk-mq.h> /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ }; extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION int blk_crypto_sysfs_register(struct gendisk *disk); void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2); static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), bio->bi_crypt_context); } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(bio->bi_crypt_context, bio->bi_iter.bi_size, req->crypt_ctx); } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), next->crypt_ctx); } static inline void blk_crypto_rq_set_defaults(struct request *rq) { rq->crypt_ctx = NULL; rq->crypt_keyslot = NULL; } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return rq->crypt_ctx; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return rq->crypt_keyslot; } blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, struct blk_crypto_keyslot **slot_ptr); void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); int __blk_crypto_evict_key(struct blk_crypto_profile *profile, const struct blk_crypto_key *key); bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) { } static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return true; } static inline void blk_crypto_rq_set_defaults(struct request *rq) { } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return false; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) { if (bio_has_crypt_ctx(bio)) __bio_crypt_advance(bio, bytes); } void __bio_crypt_free_ctx(struct bio *bio); static inline void bio_crypt_free_ctx(struct bio *bio) { if (bio_has_crypt_ctx(bio)) __bio_crypt_free_ctx(bio); } static inline void bio_crypt_do_front_merge(struct request *rq, struct bio *bio) { #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (bio_has_crypt_ctx(bio)) memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, sizeof(rq->crypt_ctx->bc_dun)); #endif } bool __blk_crypto_bio_prep(struct bio **bio_ptr); static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) { if (bio_has_crypt_ctx(*bio_ptr)) return __blk_crypto_bio_prep(bio_ptr); return true; } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } void __blk_crypto_rq_put_keyslot(struct request *rq); static inline void blk_crypto_rq_put_keyslot(struct request *rq) { if (blk_crypto_rq_has_keyslot(rq)) __blk_crypto_rq_put_keyslot(rq); } void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) __blk_crypto_free_request(rq); } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask); /** * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio * is inserted * @rq: The request to prepare * @bio: The first bio being inserted into the request * @gfp_mask: Memory allocation flags * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (bio_has_crypt_ctx(bio)) return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); return 0; } #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); #else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ static inline int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { pr_warn_once("crypto API fallback is disabled\n"); return -ENOPKG; } static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) { pr_warn_once("crypto API fallback disabled; failing request.\n"); (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; return false; } static inline int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { return 0; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ #endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */ |
1729 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 | /* * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. * * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H #include <uapi/linux/perf_event.h> #include <uapi/linux/bpf_perf_event.h> /* * Kernel-internal data types and definitions: */ #ifdef CONFIG_PERF_EVENTS # include <asm/perf_event.h> # include <asm/local64.h> #endif #define PERF_GUEST_ACTIVE 0x01 #define PERF_GUEST_USER 0x02 struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); }; #ifdef CONFIG_HAVE_HW_BREAKPOINT #include <linux/rhashtable-types.h> #include <asm/hw_breakpoint.h> #endif #include <linux/list.h> #include <linux/mutex.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/fs.h> #include <linux/pid_namespace.h> #include <linux/workqueue.h> #include <linux/ftrace.h> #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> #include <linux/cgroup.h> #include <linux/refcount.h> #include <linux/security.h> #include <linux/static_call.h> #include <linux/lockdep.h> #include <asm/local.h> struct perf_callchain_entry { __u64 nr; __u64 ip[]; /* /proc/sys/kernel/perf_event_max_stack */ }; struct perf_callchain_entry_ctx { struct perf_callchain_entry *entry; u32 max_stack; u32 nr; short contexts; bool contexts_maxed; }; typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long off, unsigned long len); struct perf_raw_frag { union { struct perf_raw_frag *next; unsigned long pad; }; perf_copy_f copy; void *data; u32 size; } __packed; struct perf_raw_record { struct perf_raw_frag frag; u32 size; }; static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag) { return frag->pad < sizeof(u64); } /* * branch stack layout: * nr: number of taken branches stored in entries[] * hw_idx: The low level index of raw branch records * for the most recent branch. * -1ULL means invalid/unknown. * * Note that nr can vary from sample to sample * branches (to, from) are stored from most recent * to least recent, i.e., entries[0] contains the most * recent branch. * The entries[] is an abstraction of raw branch records, * which may not be stored in age order in HW, e.g. Intel LBR. * The hw_idx is to expose the low level index of raw * branch record for the most recent branch aka entries[0]. * The hw_idx index is between -1 (unknown) and max depth, * which can be retrieved in /sys/devices/cpu/caps/branches. * For the architectures whose raw branch records are * already stored in age order, the hw_idx should be 0. */ struct perf_branch_stack { __u64 nr; __u64 hw_idx; struct perf_branch_entry entries[]; }; struct task_struct; /* * extra PMU register associated with an event */ struct hw_perf_event_extra { u64 config; /* register value */ unsigned int reg; /* register address or index */ int alloc; /* extra register already allocated */ int idx; /* index in shared_regs->regs[] */ }; /** * hw_perf_event::flag values * * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); /** * struct hw_perf_event - performance event hardware details: */ struct hw_perf_event { #ifdef CONFIG_PERF_EVENTS union { struct { /* hardware */ u64 config; u64 last_tag; unsigned long config_base; unsigned long event_base; int event_base_rdpmc; int idx; int last_cpu; int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; struct { /* aux / Intel-PT */ u64 aux_config; /* * For AUX area events, aux_paused cannot be a state * flag because it can be updated asynchronously to * state. */ unsigned int aux_paused; }; struct { /* software */ struct hrtimer hrtimer; }; struct { /* tracepoint */ /* for tp_event->class */ struct list_head tp_list; }; struct { /* amd_power */ u64 pwr_acc; u64 ptsc; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* * Crufty hack to avoid the chicken and egg * problem hw_breakpoint has with context * creation and event initalization. */ struct arch_hw_breakpoint info; struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ u8 iommu_bank; u8 iommu_cntr; u16 padding; u64 conf; u64 conf1; }; }; /* * If the event is a per task event, this will point to the task in * question. See the comment in perf_event_alloc(). */ struct task_struct *target; /* * PMU would store hardware filter configuration * here. */ void *addr_filters; /* Last sync'ed generation of filters */ unsigned long addr_filters_gen; /* * hw_perf_event::state flags; used to track the PERF_EF_* state. */ #define PERF_HES_STOPPED 0x01 /* the counter is stopped */ #define PERF_HES_UPTODATE 0x02 /* event->count up-to-date */ #define PERF_HES_ARCH 0x04 int state; /* * The last observed hardware counter value, updated with a * local64_cmpxchg() such that pmu::read() can be called nested. */ local64_t prev_count; /* * The period to start the next sample with. */ u64 sample_period; union { struct { /* Sampling */ /* * The period we started this sample with. */ u64 last_period; /* * However much is left of the current period; * note that this is a full 64bit value and * allows for generation of periods longer * than hardware might allow. */ local64_t period_left; }; struct { /* Topdown events counting for context switch */ u64 saved_metric; u64 saved_slots; }; }; /* * State for throttling the event, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 interrupts_seq; u64 interrupts; /* * State for freq target events, see __perf_event_overflow() and * perf_adjust_freq_unthr_context(). */ u64 freq_time_stamp; u64 freq_count_stamp; #endif }; struct perf_event; struct perf_event_pmu_context; /* * Common implementation detail of pmu::{start,commit,cancel}_txn */ #define PERF_PMU_TXN_ADD 0x1 /* txn to add/schedule event on PMU */ #define PERF_PMU_TXN_READ 0x2 /* txn to read event group from PMU */ /** * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x0001 #define PERF_PMU_CAP_NO_NMI 0x0002 #define PERF_PMU_CAP_AUX_NO_SG 0x0004 #define PERF_PMU_CAP_EXTENDED_REGS 0x0008 #define PERF_PMU_CAP_EXCLUSIVE 0x0010 #define PERF_PMU_CAP_ITRACE 0x0020 #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 /** * pmu::scope */ enum perf_pmu_scope { PERF_PMU_SCOPE_NONE = 0, PERF_PMU_SCOPE_CORE, PERF_PMU_SCOPE_DIE, PERF_PMU_SCOPE_CLUSTER, PERF_PMU_SCOPE_PKG, PERF_PMU_SCOPE_SYS_WIDE, PERF_PMU_MAX_SCOPE, }; struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) /** * struct pmu - generic performance monitoring unit */ struct pmu { struct list_head entry; struct module *module; struct device *dev; struct device *parent; const struct attribute_group **attr_groups; const struct attribute_group **attr_update; const char *name; int type; /* * various common per-pmu feature flags */ int capabilities; /* * PMU scope */ unsigned int scope; int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; /* number of address filters this PMU can do */ unsigned int nr_addr_filters; /* * Fully disable/enable this PMU, can be used to protect from the PMI * as well as for lazy/batch writing of the MSRs. */ void (*pmu_enable) (struct pmu *pmu); /* optional */ void (*pmu_disable) (struct pmu *pmu); /* optional */ /* * Try and initialize the event for this PMU. * * Returns: * -ENOENT -- @event is not for this PMU * * -ENODEV -- @event is for this PMU but PMU not present * -EBUSY -- @event is for this PMU but PMU temporarily unavailable * -EINVAL -- @event is for this PMU but @event is not valid * -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported * -EACCES -- @event is for this PMU, @event is valid, but no privileges * * 0 -- @event is for this PMU and valid * * Other error return values are allowed. */ int (*event_init) (struct perf_event *event); /* * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are * matching hw_perf_event::state flags. */ #define PERF_EF_START 0x01 /* start the counter when adding */ #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ #define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ #define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ /* * Adds/Removes a counter to/from the PMU, can be done inside a * transaction, see the ->*_txn() methods. * * The add/del callbacks will reserve all hardware resources required * to service the event, this includes any counter constraint * scheduling etc. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on. * * ->add() called without PERF_EF_START should result in the same state * as ->add() followed by ->stop(). * * ->del() must always PERF_EF_UPDATE stop an event. If it calls * ->stop() that must deal with already being stopped without * PERF_EF_UPDATE. */ int (*add) (struct perf_event *event, int flags); void (*del) (struct perf_event *event, int flags); /* * Starts/Stops a counter present on the PMU. * * The PMI handler should stop the counter when perf_event_overflow() * returns !0. ->start() will be used to continue. * * Also used to change the sample period. * * Called with IRQs disabled and the PMU disabled on the CPU the event * is on -- will be called from NMI context with the PMU generates * NMIs. * * ->stop() with PERF_EF_UPDATE will read the counter and update * period/count values like ->read() would. * * ->start() with PERF_EF_RELOAD will reprogram the counter * value, must be preceded by a ->stop() with PERF_EF_UPDATE. * * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with * PERF_EF_RESUME. * * ->start() with PERF_EF_RESUME will start as simply as possible but * only if the counter is not otherwise stopped. Will not overlap * another ->start() with PERF_EF_RESUME nor ->stop() with * PERF_EF_PAUSE. * * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other * ->stop()/->start() invocations, just not itself. */ void (*start) (struct perf_event *event, int flags); void (*stop) (struct perf_event *event, int flags); /* * Updates the counter value of the event. * * For sampling capable PMUs this will also update the software period * hw_perf_event::period_left field. */ void (*read) (struct perf_event *event); /* * Group events scheduling is treated as a transaction, add * group events as a whole and perform one schedulability test. * If the test fails, roll back the whole group * * Start the transaction, after this ->add() doesn't need to * do schedulability tests. * * Optional. */ void (*start_txn) (struct pmu *pmu, unsigned int txn_flags); /* * If ->start_txn() disabled the ->add() schedulability test * then ->commit_txn() is required to perform one. On success * the transaction is closed. On error the transaction is kept * open until ->cancel_txn() is called. * * Optional. */ int (*commit_txn) (struct pmu *pmu); /* * Will cancel the transaction, assumes ->del() is called * for each successful ->add() during the transaction. * * Optional. */ void (*cancel_txn) (struct pmu *pmu); /* * Will return the value for perf_event_mmap_page::index for this event, * if no implementation is provided it will default to 0 (see * perf_event_idx_default). */ int (*event_idx) (struct perf_event *event); /*optional */ /* * context-switches callback */ void (*sched_task) (struct perf_event_pmu_context *pmu_ctx, bool sched_in); /* * Kmem cache of PMU specific data */ struct kmem_cache *task_ctx_cache; /* * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data) * can be synchronized using this function. See Intel LBR callstack support * implementation and Perf core context switch handling callbacks for usage * examples. */ void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc, struct perf_event_pmu_context *next_epc); /* optional */ /* * Set up pmu-private data structures for an AUX area */ void *(*setup_aux) (struct perf_event *event, void **pages, int nr_pages, bool overwrite); /* optional */ /* * Free pmu-private AUX data structures */ void (*free_aux) (void *aux); /* optional */ /* * Take a snapshot of the AUX buffer without touching the event * state, so that preempting ->start()/->stop() callbacks does * not interfere with their logic. Called in PMI context. * * Returns the size of AUX data copied to the output handle. * * Optional. */ long (*snapshot_aux) (struct perf_event *event, struct perf_output_handle *handle, unsigned long size); /* * Validate address range filters: make sure the HW supports the * requested configuration and number of filters; return 0 if the * supplied filters are valid, -errno otherwise. * * Runs in the context of the ioctl()ing process and is not serialized * with the rest of the PMU callbacks. */ int (*addr_filters_validate) (struct list_head *filters); /* optional */ /* * Synchronize address range filter configuration: * translate hw-agnostic filters into hardware configuration in * event::hw::addr_filters. * * Runs as a part of filter sync sequence that is done in ->start() * callback by calling perf_event_addr_filters_sync(). * * May (and should) traverse event::addr_filters::list, for which its * caller provides necessary serialization. */ void (*addr_filters_sync) (struct perf_event *event); /* optional */ /* * Check if event can be used for aux_output purposes for * events of this PMU. * * Runs from perf_event_open(). Should return 0 for "no match" * or non-zero for "match". */ int (*aux_output_match) (struct perf_event *event); /* optional */ /* * Skip programming this PMU on the given CPU. Typically needed for * big.LITTLE things. */ bool (*filter) (struct pmu *pmu, int cpu); /* optional */ /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 value); /* optional */ }; enum perf_addr_filter_action_t { PERF_ADDR_FILTER_ACTION_STOP = 0, PERF_ADDR_FILTER_ACTION_START, PERF_ADDR_FILTER_ACTION_FILTER, }; /** * struct perf_addr_filter - address range filter definition * @entry: event's filter list linkage * @path: object file's path for file-based filters * @offset: filter range offset * @size: filter range size (size==0 means single address trigger) * @action: filter/start/stop * * This is a hardware-agnostic filter configuration as specified by the user. */ struct perf_addr_filter { struct list_head entry; struct path path; unsigned long offset; unsigned long size; enum perf_addr_filter_action_t action; }; /** * struct perf_addr_filters_head - container for address range filters * @list: list of filters for this event * @lock: spinlock that serializes accesses to the @list and event's * (and its children's) filter generations. * @nr_file_filters: number of file-based filters * * A child event will use parent's @list (and therefore @lock), so they are * bundled together; see perf_event_addr_filters(). */ struct perf_addr_filters_head { struct list_head list; raw_spinlock_t lock; unsigned int nr_file_filters; }; struct perf_addr_filter_range { unsigned long start; unsigned long size; }; /** * enum perf_event_state - the states of an event: */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -4, PERF_EVENT_STATE_EXIT = -3, PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, PERF_EVENT_STATE_ACTIVE = 1, }; struct file; struct perf_sample_data; typedef void (*perf_overflow_handler_t)(struct perf_event *, struct perf_sample_data *, struct pt_regs *regs); /* * Event capabilities. For event_caps and groups caps. * * PERF_EV_CAP_SOFTWARE: Is a software event. * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read * from any CPU in the package where it is active. * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) #define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) struct swevent_hlist { struct hlist_head heads[SWEVENT_HLIST_SIZE]; struct rcu_head rcu_head; }; #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 #define PERF_ATTACH_CHILD 0x40 struct bpf_prog; struct perf_cgroup; struct perf_buffer; struct pmu_event_list { raw_spinlock_t lock; struct list_head list; }; /* * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex * as such iteration must hold either lock. However, since ctx->lock is an IRQ * safe lock, and is only held by the CPU doing the modification, having IRQs * disabled is sufficient since it will hold-off the IPIs. */ #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) #endif #define for_each_sibling_event(sibling, event) \ lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) /** * struct perf_event - performance event kernel representation: */ struct perf_event { #ifdef CONFIG_PERF_EVENTS /* * entry onto perf_event_context::event_list; * modifications require ctx->lock * RCU safe iterations. */ struct list_head event_entry; /* * Locked for modification by both ctx->mutex and ctx->lock; holding * either sufficies for read. */ struct list_head sibling_list; struct list_head active_list; /* * Node on the pinned or flexible tree located at the event context; */ struct rb_node group_node; u64 group_index; /* * We need storage to track the entries in perf_pmu_migrate_context; we * cannot use the event_entry because of RCU and we want to keep the * group in tact which avoids us using the other two entries. */ struct list_head migrate_entry; struct hlist_node hlist_entry; struct list_head active_entry; int nr_siblings; /* Not serialized. Only written during event initialization. */ int event_caps; /* The cumulative AND of all event_caps for events in this group. */ int group_caps; unsigned int group_generation; struct perf_event *group_leader; /* * event->pmu will always point to pmu in which this event belongs. * Whereas event->pmu_ctx->pmu may point to other pmu when group of * different pmu events is created. */ struct pmu *pmu; void *pmu_private; enum perf_event_state state; unsigned int attach_state; local64_t count; atomic64_t child_count; /* * These are the total time in nanoseconds that the event * has been enabled (i.e. eligible to run, and the task has * been scheduled in, if this is a per-task event) * and running (scheduled onto the CPU), respectively. */ u64 total_time_enabled; u64 total_time_running; u64 tstamp; struct perf_event_attr attr; u16 header_size; u16 id_header_size; u16 read_size; struct hw_perf_event hw; struct perf_event_context *ctx; /* * event->pmu_ctx points to perf_event_pmu_context in which the event * is added. This pmu_ctx can be of other pmu for sw event when that * sw event is part of a group which also contains non-sw events. */ struct perf_event_pmu_context *pmu_ctx; atomic_long_t refcount; /* * These accumulate total time (in nanoseconds) that children * events have been enabled and running, respectively. */ atomic64_t child_total_time_enabled; atomic64_t child_total_time_running; /* * Protect attach/detach and child_list: */ struct mutex child_mutex; struct list_head child_list; struct perf_event *parent; int oncpu; int cpu; struct list_head owner_entry; struct task_struct *owner; /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; unsigned long rcu_batches; int rcu_pending; /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; /* delayed work for NMIs and such */ unsigned int pending_wakeup; unsigned int pending_kill; unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; struct rcuwait pending_work_wait; atomic_t event_limit; /* address range filters */ struct perf_addr_filters_head addr_filters; /* vma address array for file-based filders */ struct perf_addr_filter_range *addr_filter_ranges; unsigned long addr_filters_gen; /* for aux_output events */ struct perf_event *aux_event; void (*destroy)(struct perf_event *); struct rcu_head rcu_head; struct pid_namespace *ns; u64 id; atomic64_t lost_samples; u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; struct bpf_prog *prog; u64 bpf_cookie; #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; struct event_filter *filter; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops ftrace_ops; #endif #endif #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; /* cgroup event is attach to */ #endif #ifdef CONFIG_SECURITY void *security; #endif struct list_head sb_list; /* * Certain events gets forwarded to another pmu internally by over- * writing kernel copy of event->attr.type without user being aware * of it. event->orig_type contains original 'type' requested by * user. */ __u32 orig_type; #endif /* CONFIG_PERF_EVENTS */ }; /* * ,-----------------------[1:n]------------------------. * V V * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event * | | * `--[n:1]-> pmu <-[1:n]--' * * * struct perf_event_pmu_context lifetime is refcount based and RCU freed * (similar to perf_event_context). Locking is as if it were a member of * perf_event_context; specifically: * * modification, both: ctx->mutex && ctx->lock * reading, either: ctx->mutex || ctx->lock * * There is one exception to this; namely put_pmu_ctx() isn't always called * with ctx->mutex held; this means that as long as we can guarantee the epc * has events the above rules hold. * * Specificially, sys_perf_event_open()'s group_leader case depends on * ctx->mutex pinning the configuration. Since we hold a reference on * group_leader (through the filedesc) it can't go away, therefore it's * associated pmu_ctx must exist and cannot change due to ctx->mutex. * * perf_event holds a refcount on perf_event_context * perf_event holds a refcount on perf_event_pmu_context */ struct perf_event_pmu_context { struct pmu *pmu; struct perf_event_context *ctx; struct list_head pmu_ctx_entry; struct list_head pinned_active; struct list_head flexible_active; /* Used to avoid freeing per-cpu perf_event_pmu_context */ unsigned int embedded : 1; unsigned int nr_events; unsigned int nr_cgroups; unsigned int nr_freq; atomic_t refcount; /* event <-> epc */ struct rcu_head rcu_head; void *task_ctx_data; /* pmu specific data */ /* * Set when one or more (plausibly active) event can't be scheduled * due to pmu overcommit or pmu constraints, except tolerant to * events not necessary to be active due to scheduling constraints, * such as cgroups. */ int rotate_necessary; }; static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc) { return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active); } struct perf_event_groups { struct rb_root tree; u64 index; }; /** * struct perf_event_context - event context structure * * Used as a container for task events and CPU events as well: */ struct perf_event_context { /* * Protect the states of the events in the list, * nr_active, and the list: */ raw_spinlock_t lock; /* * Protect the list of events. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ struct mutex mutex; struct list_head pmu_ctx_list; struct perf_event_groups pinned_groups; struct perf_event_groups flexible_groups; struct list_head event_list; int nr_events; int nr_user; int is_active; int nr_task_data; int nr_stat; int nr_freq; int rotate_disable; refcount_t refcount; /* event <-> ctx */ struct task_struct *task; /* * Context clock, runs when context enabled. */ u64 time; u64 timestamp; u64 timeoffset; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. */ struct perf_event_context *parent_ctx; u64 parent_gen; u64 generation; int pin_count; #ifdef CONFIG_CGROUP_PERF int nr_cgroups; /* cgroup evts */ #endif struct rcu_head rcu_head; /* * The count of events for which using the switch-out fast path * should be avoided. * * Sum (event->pending_work + events with * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ local_t nr_no_switch_fast; }; struct perf_cpu_pmu_context { struct perf_event_pmu_context epc; struct perf_event_pmu_context *task_epc; struct list_head sched_cb_entry; int sched_cb_usage; int active_oncpu; int exclusive; raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; unsigned int hrtimer_active; }; /** * struct perf_event_cpu_context - per cpu event context structure */ struct perf_cpu_context { struct perf_event_context ctx; struct perf_event_context *task_ctx; int online; #ifdef CONFIG_CGROUP_PERF struct perf_cgroup *cgrp; #endif /* * Per-CPU storage for iterators used in visit_groups_merge. The default * storage is of size 2 to hold the CPU and any CPU event iterators. */ int heap_size; struct perf_event **heap; struct perf_event *heap_default[2]; }; struct perf_output_handle { struct perf_event *event; struct perf_buffer *rb; unsigned long wakeup; unsigned long size; u64 aux_flags; union { void *addr; unsigned long head; }; int page; }; struct bpf_perf_event_data_kern { bpf_user_pt_regs_t *regs; struct perf_sample_data *data; struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF /* * perf_cgroup_info keeps track of time_enabled for a cgroup. * This is a per-cpu dynamically allocated data structure. */ struct perf_cgroup_info { u64 time; u64 timestamp; u64 timeoffset; int active; }; struct perf_cgroup { struct cgroup_subsys_state css; struct perf_cgroup_info __percpu *info; }; /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { return container_of(task_css_check(task, perf_event_cgrp_id, ctx ? lockdep_is_held(&ctx->lock) : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ #ifdef CONFIG_PERF_EVENTS extern struct perf_event_context *perf_cpu_task_ctx(void); extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern void perf_event_itrace_started(struct perf_event *event); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); extern void perf_sched_cb_dec(struct pmu *pmu); extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern void perf_pmu_resched(struct pmu *pmu); extern int perf_event_refresh(struct perf_event *event, int refresh); extern void perf_event_update_userpage(struct perf_event *event); extern int perf_event_release_kernel(struct perf_event *event); extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, perf_overflow_handler_t callback, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs); static inline bool branch_sample_no_flags(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; } static inline bool branch_sample_no_cycles(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; } static inline bool branch_sample_type(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; } static inline bool branch_sample_hw_index(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; } static inline bool branch_sample_priv(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; } static inline bool branch_sample_counters(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_COUNTERS; } static inline bool branch_sample_call_stack(const struct perf_event *event) { return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK; } struct perf_sample_data { /* * Fields set by perf_sample_data_init() unconditionally, * group so as to minimize the cachelines touched. */ u64 sample_flags; u64 period; u64 dyn_size; /* * Fields commonly set by __perf_event_header__init_id(), * group so as to minimize the cachelines touched. */ u64 type; struct { u32 pid; u32 tid; } tid_entry; u64 time; u64 id; struct { u32 cpu; u32 reserved; } cpu_entry; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ u64 ip; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 *br_stack_cntr; union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; struct perf_regs regs_user; struct perf_regs regs_intr; u64 stack_user_size; u64 stream_id; u64 cgroup; u64 addr; u64 phys_addr; u64 data_page_size; u64 code_page_size; u64 aux_size; } ____cacheline_aligned; /* default value for data source */ #define PERF_MEM_NA (PERF_MEM_S(OP, NA) |\ PERF_MEM_S(LVL, NA) |\ PERF_MEM_S(SNOOP, NA) |\ PERF_MEM_S(LOCK, NA) |\ PERF_MEM_S(TLB, NA) |\ PERF_MEM_S(LVLNUM, NA)) static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; data->dyn_size = 0; if (addr) { data->addr = addr; data->sample_flags |= PERF_SAMPLE_ADDR; } } static inline void perf_sample_save_callchain(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { int size = 1; data->callchain = perf_callchain(event, regs); size += data->callchain->nr; data->dyn_size += size * sizeof(u64); data->sample_flags |= PERF_SAMPLE_CALLCHAIN; } static inline void perf_sample_save_raw_data(struct perf_sample_data *data, struct perf_raw_record *raw) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; int size; do { sum += frag->size; if (perf_raw_frag_last(frag)) break; frag = frag->next; } while (1); size = round_up(sum + sizeof(u32), sizeof(u64)); raw->size = size - sizeof(u32); frag->pad = raw->size - sum; data->raw = raw; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_RAW; } static inline void perf_sample_save_brstack(struct perf_sample_data *data, struct perf_event *event, struct perf_branch_stack *brs, u64 *brs_cntr) { int size = sizeof(u64); /* nr */ if (branch_sample_hw_index(event)) size += sizeof(u64); size += brs->nr * sizeof(struct perf_branch_entry); /* * The extension space for counters is appended after the * struct perf_branch_stack. It is used to store the occurrences * of events of each branch. */ if (brs_cntr) size += brs->nr * sizeof(u64); data->br_stack = brs; data->br_stack_cntr = brs_cntr; data->dyn_size += size; data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } static inline u32 perf_sample_data_size(struct perf_sample_data *data, struct perf_event *event) { u32 size = sizeof(struct perf_event_header); size += event->header_size + event->id_header_size; size += data->dyn_size; return size; } /* * Clear all bitfields in the perf_branch_entry. * The to and from fields are not cleared because they are * systematically modified by caller. */ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br) { br->mispred = 0; br->predicted = 0; br->in_tx = 0; br->abort = 0; br->cycles = 0; br->type = 0; br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern void perf_prepare_header(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs); extern int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_forward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern void perf_event_output_backward(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); extern int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs); static inline bool is_default_overflow_handler(struct perf_event *event) { perf_overflow_handler_t overflow_handler = event->overflow_handler; if (likely(overflow_handler == perf_event_output_forward)) return true; if (unlikely(overflow_handler == perf_event_output_backward)) return true; return false; } extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event); extern void perf_event__output_id_sample(struct perf_event *event, struct perf_output_handle *handle, struct perf_sample_data *sample); extern void perf_log_lost_samples(struct perf_event *event, u64 lost); static inline bool event_has_any_exclude_flag(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; return attr->exclude_idle || attr->exclude_user || attr->exclude_kernel || attr->exclude_hv || attr->exclude_guest || attr->exclude_host; } static inline bool is_sampling_event(struct perf_event *event) { return event->attr.sample_period != 0; } /* * Return 1 for a software event, 0 for a hardware event */ static inline int is_software_event(struct perf_event *event) { return event->event_caps & PERF_EV_CAP_SOFTWARE; } /* * Return 1 for event in sw context, 0 for event in hw context */ static inline int in_software_context(struct perf_event *event) { return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context; } static inline int is_exclusive_pmu(struct pmu *pmu) { return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; } extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { } #endif /* * When generating a perf sample in-line, instead of from an interrupt / * exception, we lack a pt_regs. This is typically used from software events * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints. * * We typically don't need a full set, but (for x86) do require: * - ip for PERF_SAMPLE_IP * - cs for user_mode() tests * - sp for PERF_SAMPLE_CALLCHAIN * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs()) * * NOTE: assumes @regs is otherwise already 0 filled; this is important for * things like PERF_SAMPLE_REGS_INTR. */ static inline void perf_fetch_caller_regs(struct pt_regs *regs) { perf_arch_fetch_caller_regs(regs, CALLER_ADDR0); } static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { if (static_key_false(&perf_swevent_enabled[event_id])) __perf_sw_event(event_id, nr, regs, addr); } DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); /* * 'Special' version for the scheduler, it hard assumes no recursion, * which is guaranteed by us not actually scheduling inside other swevents * because those disable preemption. */ static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); perf_fetch_caller_regs(regs); ___perf_sw_event(event_id, nr, regs, addr); } extern struct static_key_false perf_sched_events; static __always_inline bool __perf_sw_enabled(int swevt) { return static_key_false(&perf_swevent_enabled[swevt]); } static inline void perf_event_task_migrate(struct task_struct *task) { if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS)) task->sched_migrated = 1; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_in(prev, task); if (__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS) && task->sched_migrated) { __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); task->sched_migrated = 0; } } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES)) __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); #ifdef CONFIG_CGROUP_PERF if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES) && perf_cgroup_from_task(prev, NULL) != perf_cgroup_from_task(next, NULL)) __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0); #endif if (static_branch_unlikely(&perf_sched_events)) __perf_event_task_sched_out(prev, next); } extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); static inline unsigned int perf_guest_state(void) { return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); #else static inline unsigned int perf_guest_state(void) { return 0; } static inline unsigned long perf_guest_get_ip(void) { return 0; } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } #endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); extern void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len); /* Callchains */ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); extern void put_callchain_entry(int rctx); extern int sysctl_perf_event_max_stack; extern int sysctl_perf_event_max_contexts_per_stack; static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->contexts; return 0; } else { ctx->contexts_maxed = true; return -1; /* no more room, stop walking the stack */ } } static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip) { if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) { struct perf_callchain_entry *entry = ctx->entry; entry->ip[entry->nr++] = ip; ++ctx->nr; return 0; } else { return -1; /* no more room, stop walking the stack */ } } extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 /* Finer grained perf_event_open(2) access control. */ #define PERF_SECURITY_CPU 1 #define PERF_SECURITY_KERNEL 2 #define PERF_SECURITY_TRACEPOINT 3 static inline int perf_is_paranoid(void) { return sysctl_perf_event_paranoid > -1; } int perf_allow_kernel(struct perf_event_attr *attr); static inline int perf_allow_cpu(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > 0 && !perfmon_capable()) return -EACCES; return security_perf_event_open(attr, PERF_SECURITY_CPU); } static inline int perf_allow_tracepoint(struct perf_event_attr *attr) { if (sysctl_perf_event_paranoid > -1 && !perfmon_capable()) return -EPERM; return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task); extern void perf_bp_event(struct perf_event *event, void *data); extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct perf_event *event, struct pt_regs *regs); #ifndef perf_arch_misc_flags # define perf_arch_misc_flags(regs) \ (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) # define perf_arch_instruction_pointer(regs) instruction_pointer(regs) #endif #ifndef perf_arch_bpf_user_pt_regs # define perf_arch_bpf_user_pt_regs(regs) regs #endif #ifndef perf_arch_guest_misc_flags static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) { unsigned long guest_state = perf_guest_state(); if (!(guest_state & PERF_GUEST_ACTIVE)) return 0; if (guest_state & PERF_GUEST_USER) return PERF_RECORD_MISC_GUEST_USER; else return PERF_RECORD_MISC_GUEST_KERNEL; } # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) #endif static inline bool has_branch_stack(struct perf_event *event) { return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } static inline bool needs_branch_stack(struct perf_event *event) { return event->attr.branch_sample_type != 0; } static inline bool has_aux(struct perf_event *event) { return event->pmu->setup_aux; } static inline bool has_aux_action(struct perf_event *event) { return event->attr.aux_sample_size || event->attr.aux_pause || event->attr.aux_resume; } static inline bool is_write_backward(struct perf_event *event) { return !!event->attr.write_backward; } static inline bool has_addr_filter(struct perf_event *event) { return event->pmu->nr_addr_filters; } /* * An inherited event uses parent's filters */ static inline struct perf_addr_filters_head * perf_event_addr_filters(struct perf_event *event) { struct perf_addr_filters_head *ifh = &event->addr_filters; if (event->parent) ifh = &event->parent->addr_filters; return ifh; } static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) { /* Only the parent has fasync state */ if (event->parent) event = event->parent; return &event->fasync; } extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern int perf_output_begin(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len); extern long perf_output_copy_aux(struct perf_output_handle *aux_handle, struct perf_output_handle *handle, unsigned long from, unsigned long to); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern u64 perf_swevent_set_period(struct perf_event *event); extern void perf_event_enable(struct perf_event *event); extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } static inline void * perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_migrate(struct task_struct *task) { } static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { } static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } static inline int perf_event_init_task(struct task_struct *child, u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } static inline const struct perf_event *perf_get_event(struct file *file) { return ERR_PTR(-EINVAL); } static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); } static inline int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { return -EINVAL; } static inline void perf_event_print_debug(void) { } static inline int perf_event_task_disable(void) { return -EINVAL; } static inline int perf_event_task_enable(void) { return -EINVAL; } static inline int perf_event_refresh(struct perf_event *event, int refresh) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } static inline void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_text_poke(const void *addr, const void *old_bytes, size_t old_len, const void *new_bytes, size_t new_len) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } static inline u64 perf_swevent_set_period(struct perf_event *event) { return 0; } static inline void perf_event_enable(struct perf_event *event) { } static inline void perf_event_disable(struct perf_event *event) { } static inline int __perf_event_disable(void *info) { return -1; } static inline void perf_event_task_tick(void) { } static inline int perf_event_release_kernel(struct perf_event *event) { return 0; } static inline int perf_event_period(struct perf_event *event, u64 value) { return -EINVAL; } static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern void perf_restore_debug_store(void); #else static inline void perf_restore_debug_store(void) { } #endif #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x)) struct perf_pmu_events_attr { struct device_attribute attr; u64 id; const char *event_str; }; struct perf_pmu_events_ht_attr { struct device_attribute attr; u64 id; const char *event_str_ht; const char *event_str_noht; }; struct perf_pmu_events_hybrid_attr { struct device_attribute attr; u64 id; const char *event_str; u64 pmu_type; }; struct perf_pmu_format_hybrid_attr { struct device_attribute attr; u64 pmu_type; }; ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, \ }; #define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ .id = 0, \ .event_str = _str, \ }; #define PMU_EVENT_ATTR_ID(_name, _show, _id) \ (&((struct perf_pmu_events_attr[]) { \ { .attr = __ATTR(_name, 0444, _show, NULL), \ .id = _id, } \ })[0].attr.attr) #define PMU_FORMAT_ATTR_SHOW(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ #define PMU_FORMAT_ATTR(_name, _format) \ PMU_FORMAT_ATTR_SHOW(_name, _format) \ \ static struct device_attribute format_attr_##_name = __ATTR_RO(_name) /* Performance counter hotplug functions */ #ifdef CONFIG_PERF_EVENTS int perf_event_init_cpu(unsigned int cpu); int perf_event_exit_cpu(unsigned int cpu); #else #define perf_event_init_cpu NULL #define perf_event_exit_cpu NULL #endif extern void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); /* * Snapshot branch stack on software events. * * Branch stack can be very useful in understanding software events. For * example, when a long function, e.g. sys_perf_event_open, returns an * errno, it is not obvious why the function failed. Branch stack could * provide very helpful information in this type of scenarios. * * On software event, it is necessary to stop the hardware branch recorder * fast. Otherwise, the hardware register/buffer will be flushed with * entries of the triggering event. Therefore, static call is used to * stop the hardware recorder. */ /* * cnt is the number of entries allocated for entries. * Return number of entries copied to . */ typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries, unsigned int cnt); DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t); #ifndef PERF_NEEDS_LOPWR_CB static inline void perf_lopwr_cb(bool mode) { } #endif #endif /* _LINUX_PERF_EVENT_H */ |
4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 | /* * llc_c_ac.c - actions performed during connection state transition. * * Description: * Functions in this module are implementation of connection component actions * Details of actions can be found in IEEE-802.2 standard document. * All functions have one connection and one event as input argument. All of * them return 0 On success and 1 otherwise. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <linux/slab.h> #include <net/llc_conn.h> #include <net/llc_sap.h> #include <net/sock.h> #include <net/llc_c_ev.h> #include <net/llc_c_ac.h> #include <net/llc_c_st.h> #include <net/llc_pdu.h> #include <net/llc.h> static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb); static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb); static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *ev); static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb); static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk, struct sk_buff *skb); static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb); #define INCORRECT 0 int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (llc->remote_busy_flag) { u8 nr; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); llc->remote_busy_flag = 0; del_timer(&llc->busy_state_timer.timer); nr = LLC_I_GET_NR(pdu); llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); } return 0; } int llc_conn_ac_conn_ind(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->ind_prim = LLC_CONN_PRIM; return 0; } int llc_conn_ac_conn_confirm(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->cfm_prim = LLC_CONN_PRIM; return 0; } static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->cfm_prim = LLC_DATA_PRIM; return 0; } int llc_conn_ac_data_ind(struct sock *sk, struct sk_buff *skb) { llc_conn_rtn_pdu(sk, skb); return 0; } int llc_conn_ac_disc_ind(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); u8 reason = 0; int rc = 0; if (ev->type == LLC_CONN_EV_TYPE_PDU) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM) reason = LLC_DISC_REASON_RX_DM_RSP_PDU; else if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC) reason = LLC_DISC_REASON_RX_DISC_CMD_PDU; } else if (ev->type == LLC_CONN_EV_TYPE_ACK_TMR) reason = LLC_DISC_REASON_ACK_TMR_EXP; else rc = -EINVAL; if (!rc) { ev->reason = reason; ev->ind_prim = LLC_DISC_PRIM; } return rc; } int llc_conn_ac_disc_confirm(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->reason = ev->status; ev->cfm_prim = LLC_DISC_PRIM; return 0; } int llc_conn_ac_rst_ind(struct sock *sk, struct sk_buff *skb) { u8 reason = 0; int rc = 1; struct llc_conn_state_ev *ev = llc_conn_ev(skb); struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); struct llc_sock *llc = llc_sk(sk); switch (ev->type) { case LLC_CONN_EV_TYPE_PDU: if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR) { reason = LLC_RESET_REASON_LOCAL; rc = 0; } else if (LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME) { reason = LLC_RESET_REASON_REMOTE; rc = 0; } break; case LLC_CONN_EV_TYPE_ACK_TMR: case LLC_CONN_EV_TYPE_P_TMR: case LLC_CONN_EV_TYPE_REJ_TMR: case LLC_CONN_EV_TYPE_BUSY_TMR: if (llc->retry_count > llc->n2) { reason = LLC_RESET_REASON_LOCAL; rc = 0; } break; } if (!rc) { ev->reason = reason; ev->ind_prim = LLC_RESET_PRIM; } return rc; } int llc_conn_ac_rst_confirm(struct sock *sk, struct sk_buff *skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); ev->reason = 0; ev->cfm_prim = LLC_RESET_PRIM; return 0; } int llc_conn_ac_clear_remote_busy_if_f_eq_1(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && LLC_I_PF_IS_1(pdu) && llc_sk(sk)->ack_pf) llc_conn_ac_clear_remote_busy(sk, skb); return 0; } int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (llc->data_flag == 2) del_timer(&llc->rej_sent_timer.timer); return 0; } int llc_conn_ac_send_disc_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_disc_cmd(nskb, 1); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); llc_conn_ac_set_p_flag_1(sk, skb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_dm_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; u8 f_bit; llc_pdu_decode_pf_bit(skb, &f_bit); llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_dm_rsp(nskb, f_bit); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_dm_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_dm_rsp(nskb, 1); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_frmr_rsp_f_set_x(struct sock *sk, struct sk_buff *skb) { u8 f_bit; int rc = -ENOBUFS; struct sk_buff *nskb; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); struct llc_sock *llc = llc_sk(sk); llc->rx_pdu_hdr = *((u32 *)pdu); if (LLC_PDU_IS_CMD(pdu)) llc_pdu_decode_pf_bit(skb, &f_bit); else f_bit = 0; nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, sizeof(struct llc_frmr_info)); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, llc->vR, INCORRECT); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_resend_frmr_rsp_f_set_0(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, sizeof(struct llc_frmr_info)); if (nskb) { struct llc_sap *sap = llc->sap; struct llc_pdu_sn *pdu = (struct llc_pdu_sn *)&llc->rx_pdu_hdr; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_frmr_rsp(nskb, pdu, 0, llc->vS, llc->vR, INCORRECT); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_resend_frmr_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) { u8 f_bit; int rc = -ENOBUFS; struct sk_buff *nskb; struct llc_sock *llc = llc_sk(sk); llc_pdu_decode_pf_bit(skb, &f_bit); nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, sizeof(struct llc_frmr_info)); if (nskb) { struct llc_sap *sap = llc->sap; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, llc->vR, INCORRECT); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = llc_sk(sk); struct llc_sap *sap = llc->sap; llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; } static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = llc_sk(sk); struct llc_sap *sap = llc->sap; llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; } int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = llc_sk(sk); struct llc_sap *sap = llc->sap; llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return 0; } int llc_conn_ac_resend_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); u8 nr = LLC_I_GET_NR(pdu); llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); return 0; } int llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr(struct sock *sk, struct sk_buff *skb) { u8 nr; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) llc_conn_send_pdu(sk, nskb); else kfree_skb(skb); } if (rc) { nr = LLC_I_GET_NR(pdu); rc = 0; llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); } return rc; } int llc_conn_ac_resend_i_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); u8 nr = LLC_I_GET_NR(pdu); llc_conn_resend_i_pdu_as_rsp(sk, nr, 1); return 0; } int llc_conn_ac_send_rej_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_rej_cmd(nskb, 1, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_rej_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rej_rsp(nskb, 1, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_rej_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rej_rsp(nskb, 0, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_rnr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_rnr_cmd(nskb, 1, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_rnr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rnr_rsp(nskb, 1, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_set_remote_busy(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (!llc->remote_busy_flag) { llc->remote_busy_flag = 1; mod_timer(&llc->busy_state_timer.timer, jiffies + llc->busy_state_timer.expire); } return 0; } int llc_conn_ac_opt_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_rr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_rr_cmd(nskb, 1, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_rr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; u8 f_bit = 1; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rr_rsp(nskb, f_bit, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_ack_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rr_rsp(nskb, 1, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_rr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_ack_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } void llc_conn_set_p_flag(struct sock *sk, u8 value) { int state_changed = llc_sk(sk)->p_flag && !value; llc_sk(sk)->p_flag = value; if (state_changed) sk->sk_state_change(sk); } int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); if (nskb) { struct llc_sap *sap = llc->sap; const u8 *dmac = llc->daddr.mac; if (llc->dev->flags & IFF_LOOPBACK) dmac = llc->dev->dev_addr; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_sabme_cmd(nskb, 1); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, dmac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); llc_conn_set_p_flag(sk, 1); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_send_ua_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) { u8 f_bit; int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_U, 0); llc_pdu_decode_pf_bit(skb, &f_bit); if (nskb) { struct llc_sap *sap = llc->sap; nskb->dev = llc->dev; llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_ua_rsp(nskb, f_bit); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } int llc_conn_ac_set_s_flag_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->s_flag = 0; return 0; } int llc_conn_ac_set_s_flag_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->s_flag = 1; return 0; } int llc_conn_ac_start_p_timer(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); llc_conn_set_p_flag(sk, 1); mod_timer(&llc->pf_cycle_timer.timer, jiffies + llc->pf_cycle_timer.expire); return 0; } /** * llc_conn_ac_send_ack_if_needed - check if ack is needed * @sk: current connection structure * @skb: current event * * Checks number of received PDUs which have not been acknowledged, yet, * If number of them reaches to "npta"(Number of PDUs To Acknowledge) then * sends an RR response as acknowledgement for them. Returns 0 for * success, 1 otherwise. */ int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb) { u8 pf_bit; struct llc_sock *llc = llc_sk(sk); llc_pdu_decode_pf_bit(skb, &pf_bit); llc->ack_pf |= pf_bit & 1; if (!llc->ack_must_be_send) { llc->first_pdu_Ns = llc->vR; llc->ack_must_be_send = 1; llc->ack_pf = pf_bit & 1; } if (((llc->vR - llc->first_pdu_Ns + 1 + LLC_2_SEQ_NBR_MODULO) % LLC_2_SEQ_NBR_MODULO) >= llc->npta) { llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb); llc->ack_must_be_send = 0; llc->ack_pf = 0; llc_conn_ac_inc_npta_value(sk, skb); } return 0; } /** * llc_conn_ac_rst_sendack_flag - resets ack_must_be_send flag * @sk: current connection structure * @skb: current event * * This action resets ack_must_be_send flag of given connection, this flag * indicates if there is any PDU which has not been acknowledged yet. * Returns 0 for success, 1 otherwise. */ int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->ack_must_be_send = llc_sk(sk)->ack_pf = 0; return 0; } /** * llc_conn_ac_send_i_rsp_f_set_ackpf - acknowledge received PDUs * @sk: current connection structure * @skb: current event * * Sends an I response PDU with f-bit set to ack_pf flag as acknowledge to * all received PDUs which have not been acknowledged, yet. ack_pf flag is * set to one if one PDU with p-bit set to one is received. Returns 0 for * success, 1 otherwise. */ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, struct sk_buff *skb) { int rc; struct llc_sock *llc = llc_sk(sk); struct llc_sap *sap = llc->sap; llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); if (likely(!rc)) { skb_get(skb); llc_conn_send_pdu(sk, skb); llc_conn_ac_inc_vs_by_1(sk, skb); } return rc; } /** * llc_conn_ac_send_i_as_ack - sends an I-format PDU to acknowledge rx PDUs * @sk: current connection structure. * @skb: current event. * * This action sends an I-format PDU as acknowledge to received PDUs which * have not been acknowledged, yet, if there is any. By using of this * action number of acknowledgements decreases, this technic is called * piggy backing. Returns 0 for success, 1 otherwise. */ int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); int ret; if (llc->ack_must_be_send) { ret = llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb); llc->ack_must_be_send = 0 ; llc->ack_pf = 0; } else { ret = llc_conn_ac_send_i_cmd_p_set_0(sk, skb); } return ret; } /** * llc_conn_ac_send_rr_rsp_f_set_ackpf - ack all rx PDUs not yet acked * @sk: current connection structure. * @skb: current event. * * This action sends an RR response with f-bit set to ack_pf flag as * acknowledge to all received PDUs which have not been acknowledged, yet, * if there is any. ack_pf flag indicates if a PDU has been received with * p-bit set to one. Returns 0 for success, 1 otherwise. */ static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk, struct sk_buff *skb) { int rc = -ENOBUFS; struct llc_sock *llc = llc_sk(sk); struct sk_buff *nskb = llc_alloc_frame(sk, llc->dev, LLC_PDU_TYPE_S, 0); if (nskb) { struct llc_sap *sap = llc->sap; llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, llc->daddr.lsap, LLC_PDU_RSP); llc_pdu_init_as_rr_rsp(nskb, llc->ack_pf, llc->vR); rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); if (unlikely(rc)) goto free; llc_conn_send_pdu(sk, nskb); } out: return rc; free: kfree_skb(nskb); goto out; } /** * llc_conn_ac_inc_npta_value - tries to make value of npta greater * @sk: current connection structure. * @skb: current event. * * After "inc_cntr" times calling of this action, "npta" increase by one. * this action tries to make vale of "npta" greater as possible; number of * acknowledgements decreases by increasing of "npta". Returns 0 for * success, 1 otherwise. */ static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (!llc->inc_cntr) { llc->dec_step = 0; llc->dec_cntr = llc->inc_cntr = 2; ++llc->npta; if (llc->npta > (u8) ~LLC_2_SEQ_NBR_MODULO) llc->npta = (u8) ~LLC_2_SEQ_NBR_MODULO; } else --llc->inc_cntr; return 0; } /** * llc_conn_ac_adjust_npta_by_rr - decreases "npta" by one * @sk: current connection structure. * @skb: current event. * * After receiving "dec_cntr" times RR command, this action decreases * "npta" by one. Returns 0 for success, 1 otherwise. */ int llc_conn_ac_adjust_npta_by_rr(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (!llc->connect_step && !llc->remote_busy_flag) { if (!llc->dec_step) { if (!llc->dec_cntr) { llc->inc_cntr = llc->dec_cntr = 2; if (llc->npta > 0) llc->npta = llc->npta - 1; } else llc->dec_cntr -=1; } } else llc->connect_step = 0 ; return 0; } /** * llc_conn_ac_adjust_npta_by_rnr - decreases "npta" by one * @sk: current connection structure. * @skb: current event. * * After receiving "dec_cntr" times RNR command, this action decreases * "npta" by one. Returns 0 for success, 1 otherwise. */ int llc_conn_ac_adjust_npta_by_rnr(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (llc->remote_busy_flag) if (!llc->dec_step) { if (!llc->dec_cntr) { llc->inc_cntr = llc->dec_cntr = 2; if (llc->npta > 0) --llc->npta; } else --llc->dec_cntr; } return 0; } /** * llc_conn_ac_dec_tx_win_size - decreases tx window size * @sk: current connection structure. * @skb: current event. * * After receiving of a REJ command or response, transmit window size is * decreased by number of PDUs which are outstanding yet. Returns 0 for * success, 1 otherwise. */ int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q); if (llc->k - unacked_pdu < 1) llc->k = 1; else llc->k -= unacked_pdu; return 0; } /** * llc_conn_ac_inc_tx_win_size - tx window size is inc by 1 * @sk: current connection structure. * @skb: current event. * * After receiving an RR response with f-bit set to one, transmit window * size is increased by one. Returns 0 for success, 1 otherwise. */ int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); llc->k += 1; if (llc->k > (u8) ~LLC_2_SEQ_NBR_MODULO) llc->k = (u8) ~LLC_2_SEQ_NBR_MODULO; return 0; } int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb) { llc_sk_stop_all_timers(sk, false); return 0; } int llc_conn_ac_stop_other_timers(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); del_timer(&llc->rej_sent_timer.timer); del_timer(&llc->pf_cycle_timer.timer); del_timer(&llc->busy_state_timer.timer); llc->ack_must_be_send = 0; llc->ack_pf = 0; return 0; } int llc_conn_ac_start_ack_timer(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire); return 0; } int llc_conn_ac_start_rej_timer(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); mod_timer(&llc->rej_sent_timer.timer, jiffies + llc->rej_sent_timer.expire); return 0; } int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); if (!timer_pending(&llc->ack_timer.timer)) mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire); return 0; } int llc_conn_ac_stop_ack_timer(struct sock *sk, struct sk_buff *skb) { del_timer(&llc_sk(sk)->ack_timer.timer); return 0; } int llc_conn_ac_stop_p_timer(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); del_timer(&llc->pf_cycle_timer.timer); llc_conn_set_p_flag(sk, 0); return 0; } int llc_conn_ac_stop_rej_timer(struct sock *sk, struct sk_buff *skb) { del_timer(&llc_sk(sk)->rej_sent_timer.timer); return 0; } int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb) { int acked; u16 unacked = 0; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); struct llc_sock *llc = llc_sk(sk); llc->last_nr = PDU_SUPV_GET_Nr(pdu); acked = llc_conn_remove_acked_pdus(sk, llc->last_nr, &unacked); /* On loopback we don't queue I frames in unack_pdu_q queue. */ if (acked > 0 || (llc->dev->flags & IFF_LOOPBACK)) { llc->retry_count = 0; del_timer(&llc->ack_timer.timer); if (llc->failed_data_req) { /* already, we did not accept data from upper layer * (tx_window full or unacceptable state). Now, we * can send data and must inform to upper layer. */ llc->failed_data_req = 0; llc_conn_ac_data_confirm(sk, skb); } if (unacked) mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire); } else if (llc->failed_data_req) { u8 f_bit; llc_pdu_decode_pf_bit(skb, &f_bit); if (f_bit == 1) { llc->failed_data_req = 0; llc_conn_ac_data_confirm(sk, skb); } } return 0; } int llc_conn_ac_upd_p_flag(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if (LLC_PDU_IS_RSP(pdu)) { u8 f_bit; llc_pdu_decode_pf_bit(skb, &f_bit); if (f_bit) { llc_conn_set_p_flag(sk, 0); llc_conn_ac_stop_p_timer(sk, skb); } } return 0; } int llc_conn_ac_set_data_flag_2(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->data_flag = 2; return 0; } int llc_conn_ac_set_data_flag_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->data_flag = 0; return 0; } int llc_conn_ac_set_data_flag_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->data_flag = 1; return 0; } int llc_conn_ac_set_data_flag_1_if_data_flag_eq_0(struct sock *sk, struct sk_buff *skb) { if (!llc_sk(sk)->data_flag) llc_sk(sk)->data_flag = 1; return 0; } int llc_conn_ac_set_p_flag_0(struct sock *sk, struct sk_buff *skb) { llc_conn_set_p_flag(sk, 0); return 0; } static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb) { llc_conn_set_p_flag(sk, 1); return 0; } int llc_conn_ac_set_remote_busy_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->remote_busy_flag = 0; return 0; } int llc_conn_ac_set_cause_flag_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->cause_flag = 0; return 0; } int llc_conn_ac_set_cause_flag_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->cause_flag = 1; return 0; } int llc_conn_ac_set_retry_cnt_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->retry_count = 0; return 0; } int llc_conn_ac_inc_retry_cnt_by_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->retry_count++; return 0; } int llc_conn_ac_set_vr_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vR = 0; return 0; } int llc_conn_ac_inc_vr_by_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vR = PDU_GET_NEXT_Vr(llc_sk(sk)->vR); return 0; } int llc_conn_ac_set_vs_0(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vS = 0; return 0; } int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vS = llc_sk(sk)->last_nr; return 0; } static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % LLC_2_SEQ_NBR_MODULO; return 0; } static void llc_conn_tmr_common_cb(struct sock *sk, u8 type) { struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); bh_lock_sock(sk); if (skb) { struct llc_conn_state_ev *ev = llc_conn_ev(skb); skb_set_owner_r(skb, sk); ev->type = type; llc_process_tmr_ev(sk, skb); } bh_unlock_sock(sk); } void llc_conn_pf_cycle_tmr_cb(struct timer_list *t) { struct llc_sock *llc = from_timer(llc, t, pf_cycle_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_P_TMR); } void llc_conn_busy_tmr_cb(struct timer_list *t) { struct llc_sock *llc = from_timer(llc, t, busy_state_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_BUSY_TMR); } void llc_conn_ack_tmr_cb(struct timer_list *t) { struct llc_sock *llc = from_timer(llc, t, ack_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_ACK_TMR); } void llc_conn_rej_tmr_cb(struct timer_list *t) { struct llc_sock *llc = from_timer(llc, t, rej_sent_timer.timer); llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_REJ_TMR); } int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb) { llc_sk(sk)->X = llc_sk(sk)->vS; llc_conn_ac_set_vs_nr(sk, skb); return 0; } int llc_conn_ac_upd_vs(struct sock *sk, struct sk_buff *skb) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); u8 nr = PDU_SUPV_GET_Nr(pdu); if (llc_circular_between(llc_sk(sk)->vS, nr, llc_sk(sk)->X)) llc_conn_ac_set_vs_nr(sk, skb); return 0; } /* * Non-standard actions; these not contained in IEEE specification; for * our own usage */ /** * llc_conn_disc - removes connection from SAP list and frees it * @sk: closed connection * @skb: occurred event */ int llc_conn_disc(struct sock *sk, struct sk_buff *skb) { /* FIXME: this thing seems to want to die */ return 0; } /** * llc_conn_reset - resets connection * @sk : reseting connection. * @skb: occurred event. * * Stop all timers, empty all queues and reset all flags. */ int llc_conn_reset(struct sock *sk, struct sk_buff *skb) { llc_sk_reset(sk); return 0; } /** * llc_circular_between - designates that b is between a and c or not * @a: lower bound * @b: element to see if is between a and b * @c: upper bound * * This function designates that b is between a and c or not (for example, * 0 is between 127 and 1). Returns 1 if b is between a and c, 0 * otherwise. */ u8 llc_circular_between(u8 a, u8 b, u8 c) { b = b - a; c = c - a; return b <= c; } /** * llc_process_tmr_ev - timer backend * @sk: active connection * @skb: occurred event * * This function is called from timer callback functions. When connection * is busy (during sending a data frame) timer expiration event must be * queued. Otherwise this event can be sent to connection state machine. * Queued events will process by llc_backlog_rcv function after sending * data frame. */ static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb) { if (llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) { printk(KERN_WARNING "%s: timer called on closed connection\n", __func__); kfree_skb(skb); } else { if (!sock_owned_by_user(sk)) llc_conn_state_process(sk, skb); else { llc_set_backlog_type(skb, LLC_EVENT); __sk_add_backlog(sk, skb); } } } |
1 2 2 2 3 1 3 1 1 1 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 | // SPDX-License-Identifier: GPL-2.0-only /* * TCP Veno congestion control * * This is based on the congestion detection/avoidance scheme described in * C. P. Fu, S. C. Liew. * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." * IEEE Journal on Selected Areas in Communication, * Feb. 2003. * See https://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf */ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <net/tcp.h> /* Default values of the Veno variables, in fixed-point representation * with V_PARAM_SHIFT bits to the right of the binary point. */ #define V_PARAM_SHIFT 1 static const int beta = 3 << V_PARAM_SHIFT; /* Veno variables */ struct veno { u8 doing_veno_now; /* if true, do veno for this rtt */ u16 cntrtt; /* # of rtts measured within last rtt */ u32 minrtt; /* min of rtts measured within last rtt (in usec) */ u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ u32 inc; /* decide whether to increase cwnd */ u32 diff; /* calculate the diff rate */ }; /* There are several situations when we must "re-start" Veno: * * o when a connection is established * o after an RTO * o after fast recovery * o when we send a packet and there is no outstanding * unacknowledged data (restarting an idle connection) * */ static inline void veno_enable(struct sock *sk) { struct veno *veno = inet_csk_ca(sk); /* turn on Veno */ veno->doing_veno_now = 1; veno->minrtt = 0x7fffffff; } static inline void veno_disable(struct sock *sk) { struct veno *veno = inet_csk_ca(sk); /* turn off Veno */ veno->doing_veno_now = 0; } static void tcp_veno_init(struct sock *sk) { struct veno *veno = inet_csk_ca(sk); veno->basertt = 0x7fffffff; veno->inc = 1; veno_enable(sk); } /* Do rtt sampling needed for Veno. */ static void tcp_veno_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct veno *veno = inet_csk_ca(sk); u32 vrtt; if (sample->rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ vrtt = sample->rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < veno->basertt) veno->basertt = vrtt; /* Find the min rtt during the last rtt to find * the current prop. delay + queuing delay: */ veno->minrtt = min(veno->minrtt, vrtt); veno->cntrtt++; } static void tcp_veno_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) veno_enable(sk); else veno_disable(sk); } /* * If the connection is idle and we are restarting, * then we don't want to do any Veno calculations * until we get fresh rtt samples. So when we * restart, we reset our Veno state to a clean * state. After we get acks for this flight of * packets, _then_ we can make Veno calculations * again. */ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) tcp_veno_init(sk); } static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) { tcp_reno_cong_avoid(sk, ack, acked); return; } /* limited by applications */ if (!tcp_is_cwnd_limited(sk)) return; /* We do the Veno calculations only if we got enough rtt samples */ if (veno->cntrtt <= 2) { /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ tcp_reno_cong_avoid(sk, ack, acked); } else { u64 target_cwnd; u32 rtt; /* We have enough rtt samples, so, using the Veno * algorithm, we determine the state of the network. */ rtt = veno->minrtt; target_cwnd = (u64)tcp_snd_cwnd(tp) * veno->basertt; target_cwnd <<= V_PARAM_SHIFT; do_div(target_cwnd, rtt); veno->diff = (tcp_snd_cwnd(tp) << V_PARAM_SHIFT) - target_cwnd; if (tcp_in_slow_start(tp)) { /* Slow start. */ acked = tcp_slow_start(tp, acked); if (!acked) goto done; } /* Congestion avoidance. */ if (veno->diff < beta) { /* In the "non-congestive state", increase cwnd * every rtt. */ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } else { /* In the "congestive state", increase cwnd * every other rtt. */ if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) { if (veno->inc && tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) { tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); veno->inc = 0; } else veno->inc = 1; tp->snd_cwnd_cnt = 0; } else tp->snd_cwnd_cnt += acked; } done: if (tcp_snd_cwnd(tp) < 2) tcp_snd_cwnd_set(tp, 2); else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp) tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp); } /* Wipe the slate clean for the next rtt. */ /* veno->cntrtt = 0; */ veno->minrtt = 0x7fffffff; } /* Veno MD phase */ static u32 tcp_veno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (veno->diff < beta) /* in "non-congestive state", cut cwnd by 1/5 */ return max(tcp_snd_cwnd(tp) * 4 / 5, 2U); else /* in "congestive state", cut cwnd by 1/2 */ return max(tcp_snd_cwnd(tp) >> 1U, 2U); } static struct tcp_congestion_ops tcp_veno __read_mostly = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_veno_cong_avoid, .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, .cwnd_event = tcp_veno_cwnd_event, .owner = THIS_MODULE, .name = "veno", }; static int __init tcp_veno_register(void) { BUILD_BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_veno); return 0; } static void __exit tcp_veno_unregister(void) { tcp_unregister_congestion_control(&tcp_veno); } module_init(tcp_veno_register); module_exit(tcp_veno_unregister); MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Veno"); |
3 3 3 3 14 14 14 3 1 2 35 34 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2004, Instant802 Networks, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2022 Intel Corporation */ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/types.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wme.h" /* Default mapping in classifier to work with default * queue setup. */ const int ieee802_1d_to_ac[8] = { IEEE80211_AC_BE, IEEE80211_AC_BK, IEEE80211_AC_BK, IEEE80211_AC_BE, IEEE80211_AC_VI, IEEE80211_AC_VI, IEEE80211_AC_VO, IEEE80211_AC_VO }; static int wme_downgrade_ac(struct sk_buff *skb) { switch (skb->priority) { case 6: case 7: skb->priority = 5; /* VO -> VI */ return 0; case 4: case 5: skb->priority = 3; /* VI -> BE */ return 0; case 0: case 3: skb->priority = 2; /* BE -> BK */ return 0; default: return -1; } } /** * ieee80211_fix_reserved_tid - return the TID to use if this one is reserved * @tid: the assumed-reserved TID * * Returns: the alternative TID to use, or 0 on error */ static inline u8 ieee80211_fix_reserved_tid(u8 tid) { switch (tid) { case 0: return 3; case 1: return 2; case 2: return 1; case 3: return 0; case 4: return 5; case 5: return 4; case 6: return 7; case 7: return 6; } return 0; } static u16 ieee80211_downgrade_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; /* in case we are a client verify acm is not set for this ac */ while (sdata->wmm_acm & BIT(skb->priority)) { int ac = ieee802_1d_to_ac[skb->priority]; if (ifmgd->tx_tspec[ac].admitted_time && skb->priority == ifmgd->tx_tspec[ac].up) return ac; if (wme_downgrade_ac(skb)) { /* * This should not really happen. The AP has marked all * lower ACs to require admission control which is not * a reasonable configuration. Allow the frame to be * transmitted using AC_BK as a workaround. */ break; } } /* Check to see if this is a reserved TID */ if (sta && sta->reserved_tid == skb->priority) skb->priority = ieee80211_fix_reserved_tid(skb->priority); /* look up which queue to use for frames with this 1d tag */ return ieee802_1d_to_ac[skb->priority]; } /* Indicate which queue to use for this fully formed 802.11 frame */ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_hdr *hdr) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || local->hw.queues < IEEE80211_NUM_ACS) return 0; if (!ieee80211_is_data(hdr->frame_control)) { skb->priority = 7; return ieee802_1d_to_ac[skb->priority]; } if (!ieee80211_is_data_qos(hdr->frame_control)) { skb->priority = 0; return ieee802_1d_to_ac[skb->priority]; } p = ieee80211_get_qos_ctl(hdr); skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; return ieee80211_downgrade_queue(sdata, NULL, skb); } u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { const struct ethhdr *eth = (void *)skb->data; struct mac80211_qos_map *qos_map; bool qos; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); /* all mesh/ocb stations are required to support WME */ if ((sdata->vif.type == NL80211_IFTYPE_MESH_POINT && !is_multicast_ether_addr(eth->h_dest)) || (sdata->vif.type == NL80211_IFTYPE_OCB && sta)) qos = true; else if (sta) qos = sta->sta.wme; else qos = false; if (!qos) { skb->priority = 0; /* required for correct WPA/11i MIC */ return IEEE80211_AC_BE; } if (skb->protocol == sdata->control_port_protocol) { skb->priority = 7; goto downgrade; } /* use the data classifier to determine what 802.1d tag the * data frame has */ qos_map = rcu_dereference(sdata->qos_map); skb->priority = cfg80211_classify8021d(skb, qos_map ? &qos_map->qos_map : NULL); downgrade: return ieee80211_downgrade_queue(sdata, sta, skb); } /** * ieee80211_set_qos_hdr - Fill in the QoS header if there is one. * * @sdata: local subif * @skb: packet to be updated */ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; u8 flags; u8 *p; if (!ieee80211_is_data_qos(hdr->frame_control)) return; p = ieee80211_get_qos_ctl(hdr); /* don't overwrite the QoS field of injected frames */ if (info->flags & IEEE80211_TX_CTL_INJECTED) { /* do take into account Ack policy of injected frames */ if (*p & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; return; } /* set up the first byte */ /* * preserve everything but the TID and ACK policy * (which we both write here) */ flags = *p & ~(IEEE80211_QOS_CTL_TID_MASK | IEEE80211_QOS_CTL_ACK_POLICY_MASK); if (is_multicast_ether_addr(hdr->addr1) || sdata->noack_map & BIT(tid)) { flags |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK; info->flags |= IEEE80211_TX_CTL_NO_ACK; } *p = flags | tid; /* set up the second byte */ p++; if (ieee80211_vif_is_mesh(&sdata->vif)) { /* preserve RSPI and Mesh PS Level bit */ *p &= ((IEEE80211_QOS_CTL_RSPI | IEEE80211_QOS_CTL_MESH_PS_LEVEL) >> 8); /* Nulls don't have a mesh header (frame body) */ if (!ieee80211_is_qos_nullfunc(hdr->frame_control)) *p |= (IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT >> 8); } else { *p = 0; } } |
942 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | /* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* Copyright (c) 2002-2007 Volkswagen Group Electronic Research * Copyright (c) 2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de> * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #ifndef CAN_ML_H #define CAN_ML_H #include <linux/can.h> #include <linux/list.h> #include <linux/netdevice.h> #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) #define CAN_EFF_RCV_HASH_BITS 10 #define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS) enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX }; struct can_dev_rcv_lists { struct hlist_head rx[RX_MAX]; struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ]; struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ]; int entries; }; struct can_ml_priv { struct can_dev_rcv_lists dev_rcv_lists; #ifdef CAN_J1939 struct j1939_priv *j1939_priv; #endif }; static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev) { return netdev_get_ml_priv(dev, ML_PRIV_CAN); } static inline void can_set_ml_priv(struct net_device *dev, struct can_ml_priv *ml_priv) { netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN); } #endif /* CAN_ML_H */ |
5 36 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for diskquota-operations. When diskquota is configured these * macros expand to the right source-code. * * Author: Marco van Wieringen <mvw@planets.elm.net> */ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ #include <linux/fs.h> #define DQUOT_SPACE_WARN 0x1 #define DQUOT_SPACE_RESERVE 0x2 #define DQUOT_SPACE_NOFAIL 0x4 static inline struct quota_info *sb_dqopt(struct super_block *sb) { return &sb->s_dquot; } /* i_mutex must being held */ static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { return ((ia->ia_valid & ATTR_SIZE) || i_uid_needs_update(idmap, ia, inode) || i_gid_needs_update(idmap, ia, inode)); } #if defined(CONFIG_QUOTA) #define quota_error(sb, fmt, args...) \ __quota_error((sb), __func__, fmt , ## args) extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); /* * declaration of quota_function calls in kernel. */ int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); static inline struct dquot *dqgrab(struct dquot *dquot) { /* Make sure someone else has active reference to dquot */ WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); atomic_inc(&dquot->dq_count); return dquot; } static inline bool dquot_is_busy(struct dquot *dquot) { if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return true; if (atomic_read(&dquot->dq_count) > 0) return true; return false; } void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv); struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags); void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); void dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); int dquot_disable(struct super_block *sb, int type, unsigned int flags); /* Suspend quotas on remount RO */ static inline int dquot_suspend(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_SUSPENDED); } int dquot_resume(struct super_block *sb, int type); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_get_next_id(struct super_block *sb, struct kqid *qid); int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int dquot_quota_off(struct super_block *sb, int type); int dquot_writeback_dquots(struct super_block *sb, int type); int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_state(struct super_block *sb, struct qc_state *state); int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id, struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) { return sb_dqopt(sb)->info + type; } /* * Functions for checking status of quota */ static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } static inline unsigned sb_any_quota_suspended(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED); } /* Does kernel know about any quota information for given sb + type? */ static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } static inline unsigned sb_any_quota_loaded(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED); } static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } /* * Operations supported for diskquotas. */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_suspended(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_suspended(struct super_block *sb) { return 0; } /* Does kernel know about any quota information for given sb + type? */ static inline int sb_has_quota_loaded(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_loaded(struct super_block *sb) { return 0; } static inline int sb_has_quota_active(struct super_block *sb, int type) { return 0; } static inline int dquot_initialize(struct inode *inode) { return 0; } static inline bool dquot_initialize_needed(struct inode *inode) { return false; } static inline void dquot_drop(struct inode *inode) { } static inline int dquot_alloc_inode(struct inode *inode) { return 0; } static inline void dquot_free_inode(struct inode *inode) { } static inline int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { return 0; } static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_add_bytes(inode, number); return 0; } static inline void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_sub_bytes(inode, number); } static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); } static inline int dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { inode_sub_bytes(inode, number); return 0; } static inline int dquot_disable(struct super_block *sb, int type, unsigned int flags) { return 0; } static inline int dquot_suspend(struct super_block *sb, int type) { return 0; } static inline int dquot_resume(struct super_block *sb, int type) { return 0; } #define dquot_file_open generic_file_open static inline int dquot_writeback_dquots(struct super_block *sb, int type) { return 0; } #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN); } static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr) { __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL); mark_inode_dirty_sync(inode); } static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { int ret; ret = dquot_alloc_space_nodirty(inode, nr); if (!ret) { /* * Mark inode fully dirty. Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly * reduces lock contention. */ mark_inode_dirty(inode); } return ret; } static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr) { dquot_alloc_space_nofail(inode, nr << inode->i_blkbits); } static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { return dquot_alloc_space(inode, nr << inode->i_blkbits); } static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0); } static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_prealloc_block_nodirty(inode, nr); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } static inline void dquot_claim_block(struct inode *inode, qsize_t nr) { dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) { dquot_reclaim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr, 0); } static inline void dquot_free_space(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr); mark_inode_dirty_sync(inode); } static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_free_block(struct inode *inode, qsize_t nr) { dquot_free_space(inode, nr << inode->i_blkbits); } static inline void dquot_release_reservation_block(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } unsigned int qtype_enforce_flag(int type); #endif /* _LINUX_QUOTAOPS_ */ |
7 1 5 6 6 6 6 6 6 6 6 6 6 5 6 6 6 6 6 4 4 4 4 4 4 3 4 3 3 3 3 3 3 3 3 3 3 3 1 1 1 2 2 2 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Common Twofish algorithm parts shared between the c and assembler * implementations * * Originally Twofish for GPG * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 * 256-bit key length added March 20, 1999 * Some modifications to reduce the text size by Werner Koch, April, 1998 * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com> * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net> * * The original author has disclaimed all copyright interest in this * code and thus put it in the public domain. The subsequent authors * have put this under the GNU General Public License. * * This code is a "clean room" implementation, written from the paper * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available * through http://www.counterpane.com/twofish.html * * For background information on multiplication in finite fields, used for * the matrix operations in the key schedule, see the book _Contemporary * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the * Third Edition. */ #include <crypto/algapi.h> #include <crypto/twofish.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> /* The large precomputed tables for the Twofish cipher (twofish.c) * Taken from the same source as twofish.c * Marc Mutz <Marc@Mutz.com> */ /* These two tables are the q0 and q1 permutations, exactly as described in * the Twofish paper. */ static const u8 q0[256] = { 0xA9, 0x67, 0xB3, 0xE8, 0x04, 0xFD, 0xA3, 0x76, 0x9A, 0x92, 0x80, 0x78, 0xE4, 0xDD, 0xD1, 0x38, 0x0D, 0xC6, 0x35, 0x98, 0x18, 0xF7, 0xEC, 0x6C, 0x43, 0x75, 0x37, 0x26, 0xFA, 0x13, 0x94, 0x48, 0xF2, 0xD0, 0x8B, 0x30, 0x84, 0x54, 0xDF, 0x23, 0x19, 0x5B, 0x3D, 0x59, 0xF3, 0xAE, 0xA2, 0x82, 0x63, 0x01, 0x83, 0x2E, 0xD9, 0x51, 0x9B, 0x7C, 0xA6, 0xEB, 0xA5, 0xBE, 0x16, 0x0C, 0xE3, 0x61, 0xC0, 0x8C, 0x3A, 0xF5, 0x73, 0x2C, 0x25, 0x0B, 0xBB, 0x4E, 0x89, 0x6B, 0x53, 0x6A, 0xB4, 0xF1, 0xE1, 0xE6, 0xBD, 0x45, 0xE2, 0xF4, 0xB6, 0x66, 0xCC, 0x95, 0x03, 0x56, 0xD4, 0x1C, 0x1E, 0xD7, 0xFB, 0xC3, 0x8E, 0xB5, 0xE9, 0xCF, 0xBF, 0xBA, 0xEA, 0x77, 0x39, 0xAF, 0x33, 0xC9, 0x62, 0x71, 0x81, 0x79, 0x09, 0xAD, 0x24, 0xCD, 0xF9, 0xD8, 0xE5, 0xC5, 0xB9, 0x4D, 0x44, 0x08, 0x86, 0xE7, 0xA1, 0x1D, 0xAA, 0xED, 0x06, 0x70, 0xB2, 0xD2, 0x41, 0x7B, 0xA0, 0x11, 0x31, 0xC2, 0x27, 0x90, 0x20, 0xF6, 0x60, 0xFF, 0x96, 0x5C, 0xB1, 0xAB, 0x9E, 0x9C, 0x52, 0x1B, 0x5F, 0x93, 0x0A, 0xEF, 0x91, 0x85, 0x49, 0xEE, 0x2D, 0x4F, 0x8F, 0x3B, 0x47, 0x87, 0x6D, 0x46, 0xD6, 0x3E, 0x69, 0x64, 0x2A, 0xCE, 0xCB, 0x2F, 0xFC, 0x97, 0x05, 0x7A, 0xAC, 0x7F, 0xD5, 0x1A, 0x4B, 0x0E, 0xA7, 0x5A, 0x28, 0x14, 0x3F, 0x29, 0x88, 0x3C, 0x4C, 0x02, 0xB8, 0xDA, 0xB0, 0x17, 0x55, 0x1F, 0x8A, 0x7D, 0x57, 0xC7, 0x8D, 0x74, 0xB7, 0xC4, 0x9F, 0x72, 0x7E, 0x15, 0x22, 0x12, 0x58, 0x07, 0x99, 0x34, 0x6E, 0x50, 0xDE, 0x68, 0x65, 0xBC, 0xDB, 0xF8, 0xC8, 0xA8, 0x2B, 0x40, 0xDC, 0xFE, 0x32, 0xA4, 0xCA, 0x10, 0x21, 0xF0, 0xD3, 0x5D, 0x0F, 0x00, 0x6F, 0x9D, 0x36, 0x42, 0x4A, 0x5E, 0xC1, 0xE0 }; static const u8 q1[256] = { 0x75, 0xF3, 0xC6, 0xF4, 0xDB, 0x7B, 0xFB, 0xC8, 0x4A, 0xD3, 0xE6, 0x6B, 0x45, 0x7D, 0xE8, 0x4B, 0xD6, 0x32, 0xD8, 0xFD, 0x37, 0x71, 0xF1, 0xE1, 0x30, 0x0F, 0xF8, 0x1B, 0x87, 0xFA, 0x06, 0x3F, 0x5E, 0xBA, 0xAE, 0x5B, 0x8A, 0x00, 0xBC, 0x9D, 0x6D, 0xC1, 0xB1, 0x0E, 0x80, 0x5D, 0xD2, 0xD5, 0xA0, 0x84, 0x07, 0x14, 0xB5, 0x90, 0x2C, 0xA3, 0xB2, 0x73, 0x4C, 0x54, 0x92, 0x74, 0x36, 0x51, 0x38, 0xB0, 0xBD, 0x5A, 0xFC, 0x60, 0x62, 0x96, 0x6C, 0x42, 0xF7, 0x10, 0x7C, 0x28, 0x27, 0x8C, 0x13, 0x95, 0x9C, 0xC7, 0x24, 0x46, 0x3B, 0x70, 0xCA, 0xE3, 0x85, 0xCB, 0x11, 0xD0, 0x93, 0xB8, 0xA6, 0x83, 0x20, 0xFF, 0x9F, 0x77, 0xC3, 0xCC, 0x03, 0x6F, 0x08, 0xBF, 0x40, 0xE7, 0x2B, 0xE2, 0x79, 0x0C, 0xAA, 0x82, 0x41, 0x3A, 0xEA, 0xB9, 0xE4, 0x9A, 0xA4, 0x97, 0x7E, 0xDA, 0x7A, 0x17, 0x66, 0x94, 0xA1, 0x1D, 0x3D, 0xF0, 0xDE, 0xB3, 0x0B, 0x72, 0xA7, 0x1C, 0xEF, 0xD1, 0x53, 0x3E, 0x8F, 0x33, 0x26, 0x5F, 0xEC, 0x76, 0x2A, 0x49, 0x81, 0x88, 0xEE, 0x21, 0xC4, 0x1A, 0xEB, 0xD9, 0xC5, 0x39, 0x99, 0xCD, 0xAD, 0x31, 0x8B, 0x01, 0x18, 0x23, 0xDD, 0x1F, 0x4E, 0x2D, 0xF9, 0x48, 0x4F, 0xF2, 0x65, 0x8E, 0x78, 0x5C, 0x58, 0x19, 0x8D, 0xE5, 0x98, 0x57, 0x67, 0x7F, 0x05, 0x64, 0xAF, 0x63, 0xB6, 0xFE, 0xF5, 0xB7, 0x3C, 0xA5, 0xCE, 0xE9, 0x68, 0x44, 0xE0, 0x4D, 0x43, 0x69, 0x29, 0x2E, 0xAC, 0x15, 0x59, 0xA8, 0x0A, 0x9E, 0x6E, 0x47, 0xDF, 0x34, 0x35, 0x6A, 0xCF, 0xDC, 0x22, 0xC9, 0xC0, 0x9B, 0x89, 0xD4, 0xED, 0xAB, 0x12, 0xA2, 0x0D, 0x52, 0xBB, 0x02, 0x2F, 0xA9, 0xD7, 0x61, 0x1E, 0xB4, 0x50, 0x04, 0xF6, 0xC2, 0x16, 0x25, 0x86, 0x56, 0x55, 0x09, 0xBE, 0x91 }; /* These MDS tables are actually tables of MDS composed with q0 and q1, * because it is only ever used that way and we can save some time by * precomputing. Of course the main saving comes from precomputing the * GF(2^8) multiplication involved in the MDS matrix multiply; by looking * things up in these tables we reduce the matrix multiply to four lookups * and three XORs. Semi-formally, the definition of these tables is: * mds[0][i] = MDS (q1[i] 0 0 0)^T mds[1][i] = MDS (0 q0[i] 0 0)^T * mds[2][i] = MDS (0 0 q1[i] 0)^T mds[3][i] = MDS (0 0 0 q0[i])^T * where ^T means "transpose", the matrix multiply is performed in GF(2^8) * represented as GF(2)[x]/v(x) where v(x)=x^8+x^6+x^5+x^3+1 as described * by Schneier et al, and I'm casually glossing over the byte/word * conversion issues. */ static const u32 mds[4][256] = { { 0xBCBC3275, 0xECEC21F3, 0x202043C6, 0xB3B3C9F4, 0xDADA03DB, 0x02028B7B, 0xE2E22BFB, 0x9E9EFAC8, 0xC9C9EC4A, 0xD4D409D3, 0x18186BE6, 0x1E1E9F6B, 0x98980E45, 0xB2B2387D, 0xA6A6D2E8, 0x2626B74B, 0x3C3C57D6, 0x93938A32, 0x8282EED8, 0x525298FD, 0x7B7BD437, 0xBBBB3771, 0x5B5B97F1, 0x474783E1, 0x24243C30, 0x5151E20F, 0xBABAC6F8, 0x4A4AF31B, 0xBFBF4887, 0x0D0D70FA, 0xB0B0B306, 0x7575DE3F, 0xD2D2FD5E, 0x7D7D20BA, 0x666631AE, 0x3A3AA35B, 0x59591C8A, 0x00000000, 0xCDCD93BC, 0x1A1AE09D, 0xAEAE2C6D, 0x7F7FABC1, 0x2B2BC7B1, 0xBEBEB90E, 0xE0E0A080, 0x8A8A105D, 0x3B3B52D2, 0x6464BAD5, 0xD8D888A0, 0xE7E7A584, 0x5F5FE807, 0x1B1B1114, 0x2C2CC2B5, 0xFCFCB490, 0x3131272C, 0x808065A3, 0x73732AB2, 0x0C0C8173, 0x79795F4C, 0x6B6B4154, 0x4B4B0292, 0x53536974, 0x94948F36, 0x83831F51, 0x2A2A3638, 0xC4C49CB0, 0x2222C8BD, 0xD5D5F85A, 0xBDBDC3FC, 0x48487860, 0xFFFFCE62, 0x4C4C0796, 0x4141776C, 0xC7C7E642, 0xEBEB24F7, 0x1C1C1410, 0x5D5D637C, 0x36362228, 0x6767C027, 0xE9E9AF8C, 0x4444F913, 0x1414EA95, 0xF5F5BB9C, 0xCFCF18C7, 0x3F3F2D24, 0xC0C0E346, 0x7272DB3B, 0x54546C70, 0x29294CCA, 0xF0F035E3, 0x0808FE85, 0xC6C617CB, 0xF3F34F11, 0x8C8CE4D0, 0xA4A45993, 0xCACA96B8, 0x68683BA6, 0xB8B84D83, 0x38382820, 0xE5E52EFF, 0xADAD569F, 0x0B0B8477, 0xC8C81DC3, 0x9999FFCC, 0x5858ED03, 0x19199A6F, 0x0E0E0A08, 0x95957EBF, 0x70705040, 0xF7F730E7, 0x6E6ECF2B, 0x1F1F6EE2, 0xB5B53D79, 0x09090F0C, 0x616134AA, 0x57571682, 0x9F9F0B41, 0x9D9D803A, 0x111164EA, 0x2525CDB9, 0xAFAFDDE4, 0x4545089A, 0xDFDF8DA4, 0xA3A35C97, 0xEAEAD57E, 0x353558DA, 0xEDEDD07A, 0x4343FC17, 0xF8F8CB66, 0xFBFBB194, 0x3737D3A1, 0xFAFA401D, 0xC2C2683D, 0xB4B4CCF0, 0x32325DDE, 0x9C9C71B3, 0x5656E70B, 0xE3E3DA72, 0x878760A7, 0x15151B1C, 0xF9F93AEF, 0x6363BFD1, 0x3434A953, 0x9A9A853E, 0xB1B1428F, 0x7C7CD133, 0x88889B26, 0x3D3DA65F, 0xA1A1D7EC, 0xE4E4DF76, 0x8181942A, 0x91910149, 0x0F0FFB81, 0xEEEEAA88, 0x161661EE, 0xD7D77321, 0x9797F5C4, 0xA5A5A81A, 0xFEFE3FEB, 0x6D6DB5D9, 0x7878AEC5, 0xC5C56D39, 0x1D1DE599, 0x7676A4CD, 0x3E3EDCAD, 0xCBCB6731, 0xB6B6478B, 0xEFEF5B01, 0x12121E18, 0x6060C523, 0x6A6AB0DD, 0x4D4DF61F, 0xCECEE94E, 0xDEDE7C2D, 0x55559DF9, 0x7E7E5A48, 0x2121B24F, 0x03037AF2, 0xA0A02665, 0x5E5E198E, 0x5A5A6678, 0x65654B5C, 0x62624E58, 0xFDFD4519, 0x0606F48D, 0x404086E5, 0xF2F2BE98, 0x3333AC57, 0x17179067, 0x05058E7F, 0xE8E85E05, 0x4F4F7D64, 0x89896AAF, 0x10109563, 0x74742FB6, 0x0A0A75FE, 0x5C5C92F5, 0x9B9B74B7, 0x2D2D333C, 0x3030D6A5, 0x2E2E49CE, 0x494989E9, 0x46467268, 0x77775544, 0xA8A8D8E0, 0x9696044D, 0x2828BD43, 0xA9A92969, 0xD9D97929, 0x8686912E, 0xD1D187AC, 0xF4F44A15, 0x8D8D1559, 0xD6D682A8, 0xB9B9BC0A, 0x42420D9E, 0xF6F6C16E, 0x2F2FB847, 0xDDDD06DF, 0x23233934, 0xCCCC6235, 0xF1F1C46A, 0xC1C112CF, 0x8585EBDC, 0x8F8F9E22, 0x7171A1C9, 0x9090F0C0, 0xAAAA539B, 0x0101F189, 0x8B8BE1D4, 0x4E4E8CED, 0x8E8E6FAB, 0xABABA212, 0x6F6F3EA2, 0xE6E6540D, 0xDBDBF252, 0x92927BBB, 0xB7B7B602, 0x6969CA2F, 0x3939D9A9, 0xD3D30CD7, 0xA7A72361, 0xA2A2AD1E, 0xC3C399B4, 0x6C6C4450, 0x07070504, 0x04047FF6, 0x272746C2, 0xACACA716, 0xD0D07625, 0x50501386, 0xDCDCF756, 0x84841A55, 0xE1E15109, 0x7A7A25BE, 0x1313EF91}, { 0xA9D93939, 0x67901717, 0xB3719C9C, 0xE8D2A6A6, 0x04050707, 0xFD985252, 0xA3658080, 0x76DFE4E4, 0x9A084545, 0x92024B4B, 0x80A0E0E0, 0x78665A5A, 0xE4DDAFAF, 0xDDB06A6A, 0xD1BF6363, 0x38362A2A, 0x0D54E6E6, 0xC6432020, 0x3562CCCC, 0x98BEF2F2, 0x181E1212, 0xF724EBEB, 0xECD7A1A1, 0x6C774141, 0x43BD2828, 0x7532BCBC, 0x37D47B7B, 0x269B8888, 0xFA700D0D, 0x13F94444, 0x94B1FBFB, 0x485A7E7E, 0xF27A0303, 0xD0E48C8C, 0x8B47B6B6, 0x303C2424, 0x84A5E7E7, 0x54416B6B, 0xDF06DDDD, 0x23C56060, 0x1945FDFD, 0x5BA33A3A, 0x3D68C2C2, 0x59158D8D, 0xF321ECEC, 0xAE316666, 0xA23E6F6F, 0x82165757, 0x63951010, 0x015BEFEF, 0x834DB8B8, 0x2E918686, 0xD9B56D6D, 0x511F8383, 0x9B53AAAA, 0x7C635D5D, 0xA63B6868, 0xEB3FFEFE, 0xA5D63030, 0xBE257A7A, 0x16A7ACAC, 0x0C0F0909, 0xE335F0F0, 0x6123A7A7, 0xC0F09090, 0x8CAFE9E9, 0x3A809D9D, 0xF5925C5C, 0x73810C0C, 0x2C273131, 0x2576D0D0, 0x0BE75656, 0xBB7B9292, 0x4EE9CECE, 0x89F10101, 0x6B9F1E1E, 0x53A93434, 0x6AC4F1F1, 0xB499C3C3, 0xF1975B5B, 0xE1834747, 0xE66B1818, 0xBDC82222, 0x450E9898, 0xE26E1F1F, 0xF4C9B3B3, 0xB62F7474, 0x66CBF8F8, 0xCCFF9999, 0x95EA1414, 0x03ED5858, 0x56F7DCDC, 0xD4E18B8B, 0x1C1B1515, 0x1EADA2A2, 0xD70CD3D3, 0xFB2BE2E2, 0xC31DC8C8, 0x8E195E5E, 0xB5C22C2C, 0xE9894949, 0xCF12C1C1, 0xBF7E9595, 0xBA207D7D, 0xEA641111, 0x77840B0B, 0x396DC5C5, 0xAF6A8989, 0x33D17C7C, 0xC9A17171, 0x62CEFFFF, 0x7137BBBB, 0x81FB0F0F, 0x793DB5B5, 0x0951E1E1, 0xADDC3E3E, 0x242D3F3F, 0xCDA47676, 0xF99D5555, 0xD8EE8282, 0xE5864040, 0xC5AE7878, 0xB9CD2525, 0x4D049696, 0x44557777, 0x080A0E0E, 0x86135050, 0xE730F7F7, 0xA1D33737, 0x1D40FAFA, 0xAA346161, 0xED8C4E4E, 0x06B3B0B0, 0x706C5454, 0xB22A7373, 0xD2523B3B, 0x410B9F9F, 0x7B8B0202, 0xA088D8D8, 0x114FF3F3, 0x3167CBCB, 0xC2462727, 0x27C06767, 0x90B4FCFC, 0x20283838, 0xF67F0404, 0x60784848, 0xFF2EE5E5, 0x96074C4C, 0x5C4B6565, 0xB1C72B2B, 0xAB6F8E8E, 0x9E0D4242, 0x9CBBF5F5, 0x52F2DBDB, 0x1BF34A4A, 0x5FA63D3D, 0x9359A4A4, 0x0ABCB9B9, 0xEF3AF9F9, 0x91EF1313, 0x85FE0808, 0x49019191, 0xEE611616, 0x2D7CDEDE, 0x4FB22121, 0x8F42B1B1, 0x3BDB7272, 0x47B82F2F, 0x8748BFBF, 0x6D2CAEAE, 0x46E3C0C0, 0xD6573C3C, 0x3E859A9A, 0x6929A9A9, 0x647D4F4F, 0x2A948181, 0xCE492E2E, 0xCB17C6C6, 0x2FCA6969, 0xFCC3BDBD, 0x975CA3A3, 0x055EE8E8, 0x7AD0EDED, 0xAC87D1D1, 0x7F8E0505, 0xD5BA6464, 0x1AA8A5A5, 0x4BB72626, 0x0EB9BEBE, 0xA7608787, 0x5AF8D5D5, 0x28223636, 0x14111B1B, 0x3FDE7575, 0x2979D9D9, 0x88AAEEEE, 0x3C332D2D, 0x4C5F7979, 0x02B6B7B7, 0xB896CACA, 0xDA583535, 0xB09CC4C4, 0x17FC4343, 0x551A8484, 0x1FF64D4D, 0x8A1C5959, 0x7D38B2B2, 0x57AC3333, 0xC718CFCF, 0x8DF40606, 0x74695353, 0xB7749B9B, 0xC4F59797, 0x9F56ADAD, 0x72DAE3E3, 0x7ED5EAEA, 0x154AF4F4, 0x229E8F8F, 0x12A2ABAB, 0x584E6262, 0x07E85F5F, 0x99E51D1D, 0x34392323, 0x6EC1F6F6, 0x50446C6C, 0xDE5D3232, 0x68724646, 0x6526A0A0, 0xBC93CDCD, 0xDB03DADA, 0xF8C6BABA, 0xC8FA9E9E, 0xA882D6D6, 0x2BCF6E6E, 0x40507070, 0xDCEB8585, 0xFE750A0A, 0x328A9393, 0xA48DDFDF, 0xCA4C2929, 0x10141C1C, 0x2173D7D7, 0xF0CCB4B4, 0xD309D4D4, 0x5D108A8A, 0x0FE25151, 0x00000000, 0x6F9A1919, 0x9DE01A1A, 0x368F9494, 0x42E6C7C7, 0x4AECC9C9, 0x5EFDD2D2, 0xC1AB7F7F, 0xE0D8A8A8}, { 0xBC75BC32, 0xECF3EC21, 0x20C62043, 0xB3F4B3C9, 0xDADBDA03, 0x027B028B, 0xE2FBE22B, 0x9EC89EFA, 0xC94AC9EC, 0xD4D3D409, 0x18E6186B, 0x1E6B1E9F, 0x9845980E, 0xB27DB238, 0xA6E8A6D2, 0x264B26B7, 0x3CD63C57, 0x9332938A, 0x82D882EE, 0x52FD5298, 0x7B377BD4, 0xBB71BB37, 0x5BF15B97, 0x47E14783, 0x2430243C, 0x510F51E2, 0xBAF8BAC6, 0x4A1B4AF3, 0xBF87BF48, 0x0DFA0D70, 0xB006B0B3, 0x753F75DE, 0xD25ED2FD, 0x7DBA7D20, 0x66AE6631, 0x3A5B3AA3, 0x598A591C, 0x00000000, 0xCDBCCD93, 0x1A9D1AE0, 0xAE6DAE2C, 0x7FC17FAB, 0x2BB12BC7, 0xBE0EBEB9, 0xE080E0A0, 0x8A5D8A10, 0x3BD23B52, 0x64D564BA, 0xD8A0D888, 0xE784E7A5, 0x5F075FE8, 0x1B141B11, 0x2CB52CC2, 0xFC90FCB4, 0x312C3127, 0x80A38065, 0x73B2732A, 0x0C730C81, 0x794C795F, 0x6B546B41, 0x4B924B02, 0x53745369, 0x9436948F, 0x8351831F, 0x2A382A36, 0xC4B0C49C, 0x22BD22C8, 0xD55AD5F8, 0xBDFCBDC3, 0x48604878, 0xFF62FFCE, 0x4C964C07, 0x416C4177, 0xC742C7E6, 0xEBF7EB24, 0x1C101C14, 0x5D7C5D63, 0x36283622, 0x672767C0, 0xE98CE9AF, 0x441344F9, 0x149514EA, 0xF59CF5BB, 0xCFC7CF18, 0x3F243F2D, 0xC046C0E3, 0x723B72DB, 0x5470546C, 0x29CA294C, 0xF0E3F035, 0x088508FE, 0xC6CBC617, 0xF311F34F, 0x8CD08CE4, 0xA493A459, 0xCAB8CA96, 0x68A6683B, 0xB883B84D, 0x38203828, 0xE5FFE52E, 0xAD9FAD56, 0x0B770B84, 0xC8C3C81D, 0x99CC99FF, 0x580358ED, 0x196F199A, 0x0E080E0A, 0x95BF957E, 0x70407050, 0xF7E7F730, 0x6E2B6ECF, 0x1FE21F6E, 0xB579B53D, 0x090C090F, 0x61AA6134, 0x57825716, 0x9F419F0B, 0x9D3A9D80, 0x11EA1164, 0x25B925CD, 0xAFE4AFDD, 0x459A4508, 0xDFA4DF8D, 0xA397A35C, 0xEA7EEAD5, 0x35DA3558, 0xED7AEDD0, 0x431743FC, 0xF866F8CB, 0xFB94FBB1, 0x37A137D3, 0xFA1DFA40, 0xC23DC268, 0xB4F0B4CC, 0x32DE325D, 0x9CB39C71, 0x560B56E7, 0xE372E3DA, 0x87A78760, 0x151C151B, 0xF9EFF93A, 0x63D163BF, 0x345334A9, 0x9A3E9A85, 0xB18FB142, 0x7C337CD1, 0x8826889B, 0x3D5F3DA6, 0xA1ECA1D7, 0xE476E4DF, 0x812A8194, 0x91499101, 0x0F810FFB, 0xEE88EEAA, 0x16EE1661, 0xD721D773, 0x97C497F5, 0xA51AA5A8, 0xFEEBFE3F, 0x6DD96DB5, 0x78C578AE, 0xC539C56D, 0x1D991DE5, 0x76CD76A4, 0x3EAD3EDC, 0xCB31CB67, 0xB68BB647, 0xEF01EF5B, 0x1218121E, 0x602360C5, 0x6ADD6AB0, 0x4D1F4DF6, 0xCE4ECEE9, 0xDE2DDE7C, 0x55F9559D, 0x7E487E5A, 0x214F21B2, 0x03F2037A, 0xA065A026, 0x5E8E5E19, 0x5A785A66, 0x655C654B, 0x6258624E, 0xFD19FD45, 0x068D06F4, 0x40E54086, 0xF298F2BE, 0x335733AC, 0x17671790, 0x057F058E, 0xE805E85E, 0x4F644F7D, 0x89AF896A, 0x10631095, 0x74B6742F, 0x0AFE0A75, 0x5CF55C92, 0x9BB79B74, 0x2D3C2D33, 0x30A530D6, 0x2ECE2E49, 0x49E94989, 0x46684672, 0x77447755, 0xA8E0A8D8, 0x964D9604, 0x284328BD, 0xA969A929, 0xD929D979, 0x862E8691, 0xD1ACD187, 0xF415F44A, 0x8D598D15, 0xD6A8D682, 0xB90AB9BC, 0x429E420D, 0xF66EF6C1, 0x2F472FB8, 0xDDDFDD06, 0x23342339, 0xCC35CC62, 0xF16AF1C4, 0xC1CFC112, 0x85DC85EB, 0x8F228F9E, 0x71C971A1, 0x90C090F0, 0xAA9BAA53, 0x018901F1, 0x8BD48BE1, 0x4EED4E8C, 0x8EAB8E6F, 0xAB12ABA2, 0x6FA26F3E, 0xE60DE654, 0xDB52DBF2, 0x92BB927B, 0xB702B7B6, 0x692F69CA, 0x39A939D9, 0xD3D7D30C, 0xA761A723, 0xA21EA2AD, 0xC3B4C399, 0x6C506C44, 0x07040705, 0x04F6047F, 0x27C22746, 0xAC16ACA7, 0xD025D076, 0x50865013, 0xDC56DCF7, 0x8455841A, 0xE109E151, 0x7ABE7A25, 0x139113EF}, { 0xD939A9D9, 0x90176790, 0x719CB371, 0xD2A6E8D2, 0x05070405, 0x9852FD98, 0x6580A365, 0xDFE476DF, 0x08459A08, 0x024B9202, 0xA0E080A0, 0x665A7866, 0xDDAFE4DD, 0xB06ADDB0, 0xBF63D1BF, 0x362A3836, 0x54E60D54, 0x4320C643, 0x62CC3562, 0xBEF298BE, 0x1E12181E, 0x24EBF724, 0xD7A1ECD7, 0x77416C77, 0xBD2843BD, 0x32BC7532, 0xD47B37D4, 0x9B88269B, 0x700DFA70, 0xF94413F9, 0xB1FB94B1, 0x5A7E485A, 0x7A03F27A, 0xE48CD0E4, 0x47B68B47, 0x3C24303C, 0xA5E784A5, 0x416B5441, 0x06DDDF06, 0xC56023C5, 0x45FD1945, 0xA33A5BA3, 0x68C23D68, 0x158D5915, 0x21ECF321, 0x3166AE31, 0x3E6FA23E, 0x16578216, 0x95106395, 0x5BEF015B, 0x4DB8834D, 0x91862E91, 0xB56DD9B5, 0x1F83511F, 0x53AA9B53, 0x635D7C63, 0x3B68A63B, 0x3FFEEB3F, 0xD630A5D6, 0x257ABE25, 0xA7AC16A7, 0x0F090C0F, 0x35F0E335, 0x23A76123, 0xF090C0F0, 0xAFE98CAF, 0x809D3A80, 0x925CF592, 0x810C7381, 0x27312C27, 0x76D02576, 0xE7560BE7, 0x7B92BB7B, 0xE9CE4EE9, 0xF10189F1, 0x9F1E6B9F, 0xA93453A9, 0xC4F16AC4, 0x99C3B499, 0x975BF197, 0x8347E183, 0x6B18E66B, 0xC822BDC8, 0x0E98450E, 0x6E1FE26E, 0xC9B3F4C9, 0x2F74B62F, 0xCBF866CB, 0xFF99CCFF, 0xEA1495EA, 0xED5803ED, 0xF7DC56F7, 0xE18BD4E1, 0x1B151C1B, 0xADA21EAD, 0x0CD3D70C, 0x2BE2FB2B, 0x1DC8C31D, 0x195E8E19, 0xC22CB5C2, 0x8949E989, 0x12C1CF12, 0x7E95BF7E, 0x207DBA20, 0x6411EA64, 0x840B7784, 0x6DC5396D, 0x6A89AF6A, 0xD17C33D1, 0xA171C9A1, 0xCEFF62CE, 0x37BB7137, 0xFB0F81FB, 0x3DB5793D, 0x51E10951, 0xDC3EADDC, 0x2D3F242D, 0xA476CDA4, 0x9D55F99D, 0xEE82D8EE, 0x8640E586, 0xAE78C5AE, 0xCD25B9CD, 0x04964D04, 0x55774455, 0x0A0E080A, 0x13508613, 0x30F7E730, 0xD337A1D3, 0x40FA1D40, 0x3461AA34, 0x8C4EED8C, 0xB3B006B3, 0x6C54706C, 0x2A73B22A, 0x523BD252, 0x0B9F410B, 0x8B027B8B, 0x88D8A088, 0x4FF3114F, 0x67CB3167, 0x4627C246, 0xC06727C0, 0xB4FC90B4, 0x28382028, 0x7F04F67F, 0x78486078, 0x2EE5FF2E, 0x074C9607, 0x4B655C4B, 0xC72BB1C7, 0x6F8EAB6F, 0x0D429E0D, 0xBBF59CBB, 0xF2DB52F2, 0xF34A1BF3, 0xA63D5FA6, 0x59A49359, 0xBCB90ABC, 0x3AF9EF3A, 0xEF1391EF, 0xFE0885FE, 0x01914901, 0x6116EE61, 0x7CDE2D7C, 0xB2214FB2, 0x42B18F42, 0xDB723BDB, 0xB82F47B8, 0x48BF8748, 0x2CAE6D2C, 0xE3C046E3, 0x573CD657, 0x859A3E85, 0x29A96929, 0x7D4F647D, 0x94812A94, 0x492ECE49, 0x17C6CB17, 0xCA692FCA, 0xC3BDFCC3, 0x5CA3975C, 0x5EE8055E, 0xD0ED7AD0, 0x87D1AC87, 0x8E057F8E, 0xBA64D5BA, 0xA8A51AA8, 0xB7264BB7, 0xB9BE0EB9, 0x6087A760, 0xF8D55AF8, 0x22362822, 0x111B1411, 0xDE753FDE, 0x79D92979, 0xAAEE88AA, 0x332D3C33, 0x5F794C5F, 0xB6B702B6, 0x96CAB896, 0x5835DA58, 0x9CC4B09C, 0xFC4317FC, 0x1A84551A, 0xF64D1FF6, 0x1C598A1C, 0x38B27D38, 0xAC3357AC, 0x18CFC718, 0xF4068DF4, 0x69537469, 0x749BB774, 0xF597C4F5, 0x56AD9F56, 0xDAE372DA, 0xD5EA7ED5, 0x4AF4154A, 0x9E8F229E, 0xA2AB12A2, 0x4E62584E, 0xE85F07E8, 0xE51D99E5, 0x39233439, 0xC1F66EC1, 0x446C5044, 0x5D32DE5D, 0x72466872, 0x26A06526, 0x93CDBC93, 0x03DADB03, 0xC6BAF8C6, 0xFA9EC8FA, 0x82D6A882, 0xCF6E2BCF, 0x50704050, 0xEB85DCEB, 0x750AFE75, 0x8A93328A, 0x8DDFA48D, 0x4C29CA4C, 0x141C1014, 0x73D72173, 0xCCB4F0CC, 0x09D4D309, 0x108A5D10, 0xE2510FE2, 0x00000000, 0x9A196F9A, 0xE01A9DE0, 0x8F94368F, 0xE6C742E6, 0xECC94AEC, 0xFDD25EFD, 0xAB7FC1AB, 0xD8A8E0D8} }; /* The exp_to_poly and poly_to_exp tables are used to perform efficient * operations in GF(2^8) represented as GF(2)[x]/w(x) where * w(x)=x^8+x^6+x^3+x^2+1. We care about doing that because it's part of the * definition of the RS matrix in the key schedule. Elements of that field * are polynomials of degree not greater than 7 and all coefficients 0 or 1, * which can be represented naturally by bytes (just substitute x=2). In that * form, GF(2^8) addition is the same as bitwise XOR, but GF(2^8) * multiplication is inefficient without hardware support. To multiply * faster, I make use of the fact x is a generator for the nonzero elements, * so that every element p of GF(2)[x]/w(x) is either 0 or equal to (x)^n for * some n in 0..254. Note that caret is exponentiation in GF(2^8), * *not* polynomial notation. So if I want to compute pq where p and q are * in GF(2^8), I can just say: * 1. if p=0 or q=0 then pq=0 * 2. otherwise, find m and n such that p=x^m and q=x^n * 3. pq=(x^m)(x^n)=x^(m+n), so add m and n and find pq * The translations in steps 2 and 3 are looked up in the tables * poly_to_exp (for step 2) and exp_to_poly (for step 3). To see this * in action, look at the CALC_S macro. As additional wrinkles, note that * one of my operands is always a constant, so the poly_to_exp lookup on it * is done in advance; I included the original values in the comments so * readers can have some chance of recognizing that this *is* the RS matrix * from the Twofish paper. I've only included the table entries I actually * need; I never do a lookup on a variable input of zero and the biggest * exponents I'll ever see are 254 (variable) and 237 (constant), so they'll * never sum to more than 491. I'm repeating part of the exp_to_poly table * so that I don't have to do mod-255 reduction in the exponent arithmetic. * Since I know my constant operands are never zero, I only have to worry * about zero values in the variable operand, and I do it with a simple * conditional branch. I know conditionals are expensive, but I couldn't * see a non-horrible way of avoiding them, and I did manage to group the * statements so that each if covers four group multiplications. */ static const u8 poly_to_exp[255] = { 0x00, 0x01, 0x17, 0x02, 0x2E, 0x18, 0x53, 0x03, 0x6A, 0x2F, 0x93, 0x19, 0x34, 0x54, 0x45, 0x04, 0x5C, 0x6B, 0xB6, 0x30, 0xA6, 0x94, 0x4B, 0x1A, 0x8C, 0x35, 0x81, 0x55, 0xAA, 0x46, 0x0D, 0x05, 0x24, 0x5D, 0x87, 0x6C, 0x9B, 0xB7, 0xC1, 0x31, 0x2B, 0xA7, 0xA3, 0x95, 0x98, 0x4C, 0xCA, 0x1B, 0xE6, 0x8D, 0x73, 0x36, 0xCD, 0x82, 0x12, 0x56, 0x62, 0xAB, 0xF0, 0x47, 0x4F, 0x0E, 0xBD, 0x06, 0xD4, 0x25, 0xD2, 0x5E, 0x27, 0x88, 0x66, 0x6D, 0xD6, 0x9C, 0x79, 0xB8, 0x08, 0xC2, 0xDF, 0x32, 0x68, 0x2C, 0xFD, 0xA8, 0x8A, 0xA4, 0x5A, 0x96, 0x29, 0x99, 0x22, 0x4D, 0x60, 0xCB, 0xE4, 0x1C, 0x7B, 0xE7, 0x3B, 0x8E, 0x9E, 0x74, 0xF4, 0x37, 0xD8, 0xCE, 0xF9, 0x83, 0x6F, 0x13, 0xB2, 0x57, 0xE1, 0x63, 0xDC, 0xAC, 0xC4, 0xF1, 0xAF, 0x48, 0x0A, 0x50, 0x42, 0x0F, 0xBA, 0xBE, 0xC7, 0x07, 0xDE, 0xD5, 0x78, 0x26, 0x65, 0xD3, 0xD1, 0x5F, 0xE3, 0x28, 0x21, 0x89, 0x59, 0x67, 0xFC, 0x6E, 0xB1, 0xD7, 0xF8, 0x9D, 0xF3, 0x7A, 0x3A, 0xB9, 0xC6, 0x09, 0x41, 0xC3, 0xAE, 0xE0, 0xDB, 0x33, 0x44, 0x69, 0x92, 0x2D, 0x52, 0xFE, 0x16, 0xA9, 0x0C, 0x8B, 0x80, 0xA5, 0x4A, 0x5B, 0xB5, 0x97, 0xC9, 0x2A, 0xA2, 0x9A, 0xC0, 0x23, 0x86, 0x4E, 0xBC, 0x61, 0xEF, 0xCC, 0x11, 0xE5, 0x72, 0x1D, 0x3D, 0x7C, 0xEB, 0xE8, 0xE9, 0x3C, 0xEA, 0x8F, 0x7D, 0x9F, 0xEC, 0x75, 0x1E, 0xF5, 0x3E, 0x38, 0xF6, 0xD9, 0x3F, 0xCF, 0x76, 0xFA, 0x1F, 0x84, 0xA0, 0x70, 0xED, 0x14, 0x90, 0xB3, 0x7E, 0x58, 0xFB, 0xE2, 0x20, 0x64, 0xD0, 0xDD, 0x77, 0xAD, 0xDA, 0xC5, 0x40, 0xF2, 0x39, 0xB0, 0xF7, 0x49, 0xB4, 0x0B, 0x7F, 0x51, 0x15, 0x43, 0x91, 0x10, 0x71, 0xBB, 0xEE, 0xBF, 0x85, 0xC8, 0xA1 }; static const u8 exp_to_poly[492] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D, 0x9A, 0x79, 0xF2, 0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC, 0xF5, 0xA7, 0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3, 0x8B, 0x5B, 0xB6, 0x21, 0x42, 0x84, 0x45, 0x8A, 0x59, 0xB2, 0x29, 0x52, 0xA4, 0x05, 0x0A, 0x14, 0x28, 0x50, 0xA0, 0x0D, 0x1A, 0x34, 0x68, 0xD0, 0xED, 0x97, 0x63, 0xC6, 0xC1, 0xCF, 0xD3, 0xEB, 0x9B, 0x7B, 0xF6, 0xA1, 0x0F, 0x1E, 0x3C, 0x78, 0xF0, 0xAD, 0x17, 0x2E, 0x5C, 0xB8, 0x3D, 0x7A, 0xF4, 0xA5, 0x07, 0x0E, 0x1C, 0x38, 0x70, 0xE0, 0x8D, 0x57, 0xAE, 0x11, 0x22, 0x44, 0x88, 0x5D, 0xBA, 0x39, 0x72, 0xE4, 0x85, 0x47, 0x8E, 0x51, 0xA2, 0x09, 0x12, 0x24, 0x48, 0x90, 0x6D, 0xDA, 0xF9, 0xBF, 0x33, 0x66, 0xCC, 0xD5, 0xE7, 0x83, 0x4B, 0x96, 0x61, 0xC2, 0xC9, 0xDF, 0xF3, 0xAB, 0x1B, 0x36, 0x6C, 0xD8, 0xFD, 0xB7, 0x23, 0x46, 0x8C, 0x55, 0xAA, 0x19, 0x32, 0x64, 0xC8, 0xDD, 0xF7, 0xA3, 0x0B, 0x16, 0x2C, 0x58, 0xB0, 0x2D, 0x5A, 0xB4, 0x25, 0x4A, 0x94, 0x65, 0xCA, 0xD9, 0xFF, 0xB3, 0x2B, 0x56, 0xAC, 0x15, 0x2A, 0x54, 0xA8, 0x1D, 0x3A, 0x74, 0xE8, 0x9D, 0x77, 0xEE, 0x91, 0x6F, 0xDE, 0xF1, 0xAF, 0x13, 0x26, 0x4C, 0x98, 0x7D, 0xFA, 0xB9, 0x3F, 0x7E, 0xFC, 0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE, 0xB1, 0x2F, 0x5E, 0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41, 0x82, 0x49, 0x92, 0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E, 0x71, 0xE2, 0x89, 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB, 0xDB, 0xFB, 0xBB, 0x3B, 0x76, 0xEC, 0x95, 0x67, 0xCE, 0xD1, 0xEF, 0x93, 0x6B, 0xD6, 0xE1, 0x8F, 0x53, 0xA6, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x4D, 0x9A, 0x79, 0xF2, 0xA9, 0x1F, 0x3E, 0x7C, 0xF8, 0xBD, 0x37, 0x6E, 0xDC, 0xF5, 0xA7, 0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0xC0, 0xCD, 0xD7, 0xE3, 0x8B, 0x5B, 0xB6, 0x21, 0x42, 0x84, 0x45, 0x8A, 0x59, 0xB2, 0x29, 0x52, 0xA4, 0x05, 0x0A, 0x14, 0x28, 0x50, 0xA0, 0x0D, 0x1A, 0x34, 0x68, 0xD0, 0xED, 0x97, 0x63, 0xC6, 0xC1, 0xCF, 0xD3, 0xEB, 0x9B, 0x7B, 0xF6, 0xA1, 0x0F, 0x1E, 0x3C, 0x78, 0xF0, 0xAD, 0x17, 0x2E, 0x5C, 0xB8, 0x3D, 0x7A, 0xF4, 0xA5, 0x07, 0x0E, 0x1C, 0x38, 0x70, 0xE0, 0x8D, 0x57, 0xAE, 0x11, 0x22, 0x44, 0x88, 0x5D, 0xBA, 0x39, 0x72, 0xE4, 0x85, 0x47, 0x8E, 0x51, 0xA2, 0x09, 0x12, 0x24, 0x48, 0x90, 0x6D, 0xDA, 0xF9, 0xBF, 0x33, 0x66, 0xCC, 0xD5, 0xE7, 0x83, 0x4B, 0x96, 0x61, 0xC2, 0xC9, 0xDF, 0xF3, 0xAB, 0x1B, 0x36, 0x6C, 0xD8, 0xFD, 0xB7, 0x23, 0x46, 0x8C, 0x55, 0xAA, 0x19, 0x32, 0x64, 0xC8, 0xDD, 0xF7, 0xA3, 0x0B, 0x16, 0x2C, 0x58, 0xB0, 0x2D, 0x5A, 0xB4, 0x25, 0x4A, 0x94, 0x65, 0xCA, 0xD9, 0xFF, 0xB3, 0x2B, 0x56, 0xAC, 0x15, 0x2A, 0x54, 0xA8, 0x1D, 0x3A, 0x74, 0xE8, 0x9D, 0x77, 0xEE, 0x91, 0x6F, 0xDE, 0xF1, 0xAF, 0x13, 0x26, 0x4C, 0x98, 0x7D, 0xFA, 0xB9, 0x3F, 0x7E, 0xFC, 0xB5, 0x27, 0x4E, 0x9C, 0x75, 0xEA, 0x99, 0x7F, 0xFE, 0xB1, 0x2F, 0x5E, 0xBC, 0x35, 0x6A, 0xD4, 0xE5, 0x87, 0x43, 0x86, 0x41, 0x82, 0x49, 0x92, 0x69, 0xD2, 0xE9, 0x9F, 0x73, 0xE6, 0x81, 0x4F, 0x9E, 0x71, 0xE2, 0x89, 0x5F, 0xBE, 0x31, 0x62, 0xC4, 0xC5, 0xC7, 0xC3, 0xCB }; /* The table constants are indices of * S-box entries, preprocessed through q0 and q1. */ static const u8 calc_sb_tbl[512] = { 0xA9, 0x75, 0x67, 0xF3, 0xB3, 0xC6, 0xE8, 0xF4, 0x04, 0xDB, 0xFD, 0x7B, 0xA3, 0xFB, 0x76, 0xC8, 0x9A, 0x4A, 0x92, 0xD3, 0x80, 0xE6, 0x78, 0x6B, 0xE4, 0x45, 0xDD, 0x7D, 0xD1, 0xE8, 0x38, 0x4B, 0x0D, 0xD6, 0xC6, 0x32, 0x35, 0xD8, 0x98, 0xFD, 0x18, 0x37, 0xF7, 0x71, 0xEC, 0xF1, 0x6C, 0xE1, 0x43, 0x30, 0x75, 0x0F, 0x37, 0xF8, 0x26, 0x1B, 0xFA, 0x87, 0x13, 0xFA, 0x94, 0x06, 0x48, 0x3F, 0xF2, 0x5E, 0xD0, 0xBA, 0x8B, 0xAE, 0x30, 0x5B, 0x84, 0x8A, 0x54, 0x00, 0xDF, 0xBC, 0x23, 0x9D, 0x19, 0x6D, 0x5B, 0xC1, 0x3D, 0xB1, 0x59, 0x0E, 0xF3, 0x80, 0xAE, 0x5D, 0xA2, 0xD2, 0x82, 0xD5, 0x63, 0xA0, 0x01, 0x84, 0x83, 0x07, 0x2E, 0x14, 0xD9, 0xB5, 0x51, 0x90, 0x9B, 0x2C, 0x7C, 0xA3, 0xA6, 0xB2, 0xEB, 0x73, 0xA5, 0x4C, 0xBE, 0x54, 0x16, 0x92, 0x0C, 0x74, 0xE3, 0x36, 0x61, 0x51, 0xC0, 0x38, 0x8C, 0xB0, 0x3A, 0xBD, 0xF5, 0x5A, 0x73, 0xFC, 0x2C, 0x60, 0x25, 0x62, 0x0B, 0x96, 0xBB, 0x6C, 0x4E, 0x42, 0x89, 0xF7, 0x6B, 0x10, 0x53, 0x7C, 0x6A, 0x28, 0xB4, 0x27, 0xF1, 0x8C, 0xE1, 0x13, 0xE6, 0x95, 0xBD, 0x9C, 0x45, 0xC7, 0xE2, 0x24, 0xF4, 0x46, 0xB6, 0x3B, 0x66, 0x70, 0xCC, 0xCA, 0x95, 0xE3, 0x03, 0x85, 0x56, 0xCB, 0xD4, 0x11, 0x1C, 0xD0, 0x1E, 0x93, 0xD7, 0xB8, 0xFB, 0xA6, 0xC3, 0x83, 0x8E, 0x20, 0xB5, 0xFF, 0xE9, 0x9F, 0xCF, 0x77, 0xBF, 0xC3, 0xBA, 0xCC, 0xEA, 0x03, 0x77, 0x6F, 0x39, 0x08, 0xAF, 0xBF, 0x33, 0x40, 0xC9, 0xE7, 0x62, 0x2B, 0x71, 0xE2, 0x81, 0x79, 0x79, 0x0C, 0x09, 0xAA, 0xAD, 0x82, 0x24, 0x41, 0xCD, 0x3A, 0xF9, 0xEA, 0xD8, 0xB9, 0xE5, 0xE4, 0xC5, 0x9A, 0xB9, 0xA4, 0x4D, 0x97, 0x44, 0x7E, 0x08, 0xDA, 0x86, 0x7A, 0xE7, 0x17, 0xA1, 0x66, 0x1D, 0x94, 0xAA, 0xA1, 0xED, 0x1D, 0x06, 0x3D, 0x70, 0xF0, 0xB2, 0xDE, 0xD2, 0xB3, 0x41, 0x0B, 0x7B, 0x72, 0xA0, 0xA7, 0x11, 0x1C, 0x31, 0xEF, 0xC2, 0xD1, 0x27, 0x53, 0x90, 0x3E, 0x20, 0x8F, 0xF6, 0x33, 0x60, 0x26, 0xFF, 0x5F, 0x96, 0xEC, 0x5C, 0x76, 0xB1, 0x2A, 0xAB, 0x49, 0x9E, 0x81, 0x9C, 0x88, 0x52, 0xEE, 0x1B, 0x21, 0x5F, 0xC4, 0x93, 0x1A, 0x0A, 0xEB, 0xEF, 0xD9, 0x91, 0xC5, 0x85, 0x39, 0x49, 0x99, 0xEE, 0xCD, 0x2D, 0xAD, 0x4F, 0x31, 0x8F, 0x8B, 0x3B, 0x01, 0x47, 0x18, 0x87, 0x23, 0x6D, 0xDD, 0x46, 0x1F, 0xD6, 0x4E, 0x3E, 0x2D, 0x69, 0xF9, 0x64, 0x48, 0x2A, 0x4F, 0xCE, 0xF2, 0xCB, 0x65, 0x2F, 0x8E, 0xFC, 0x78, 0x97, 0x5C, 0x05, 0x58, 0x7A, 0x19, 0xAC, 0x8D, 0x7F, 0xE5, 0xD5, 0x98, 0x1A, 0x57, 0x4B, 0x67, 0x0E, 0x7F, 0xA7, 0x05, 0x5A, 0x64, 0x28, 0xAF, 0x14, 0x63, 0x3F, 0xB6, 0x29, 0xFE, 0x88, 0xF5, 0x3C, 0xB7, 0x4C, 0x3C, 0x02, 0xA5, 0xB8, 0xCE, 0xDA, 0xE9, 0xB0, 0x68, 0x17, 0x44, 0x55, 0xE0, 0x1F, 0x4D, 0x8A, 0x43, 0x7D, 0x69, 0x57, 0x29, 0xC7, 0x2E, 0x8D, 0xAC, 0x74, 0x15, 0xB7, 0x59, 0xC4, 0xA8, 0x9F, 0x0A, 0x72, 0x9E, 0x7E, 0x6E, 0x15, 0x47, 0x22, 0xDF, 0x12, 0x34, 0x58, 0x35, 0x07, 0x6A, 0x99, 0xCF, 0x34, 0xDC, 0x6E, 0x22, 0x50, 0xC9, 0xDE, 0xC0, 0x68, 0x9B, 0x65, 0x89, 0xBC, 0xD4, 0xDB, 0xED, 0xF8, 0xAB, 0xC8, 0x12, 0xA8, 0xA2, 0x2B, 0x0D, 0x40, 0x52, 0xDC, 0xBB, 0xFE, 0x02, 0x32, 0x2F, 0xA4, 0xA9, 0xCA, 0xD7, 0x10, 0x61, 0x21, 0x1E, 0xF0, 0xB4, 0xD3, 0x50, 0x5D, 0x04, 0x0F, 0xF6, 0x00, 0xC2, 0x6F, 0x16, 0x9D, 0x25, 0x36, 0x86, 0x42, 0x56, 0x4A, 0x55, 0x5E, 0x09, 0xC1, 0xBE, 0xE0, 0x91 }; /* Macro to perform one column of the RS matrix multiplication. The * parameters a, b, c, and d are the four bytes of output; i is the index * of the key bytes, and w, x, y, and z, are the column of constants from * the RS matrix, preprocessed through the poly_to_exp table. */ #define CALC_S(a, b, c, d, i, w, x, y, z) \ if (key[i]) { \ tmp = poly_to_exp[key[i] - 1]; \ (a) ^= exp_to_poly[tmp + (w)]; \ (b) ^= exp_to_poly[tmp + (x)]; \ (c) ^= exp_to_poly[tmp + (y)]; \ (d) ^= exp_to_poly[tmp + (z)]; \ } /* Macros to calculate the key-dependent S-boxes for a 128-bit key using * the S vector from CALC_S. CALC_SB_2 computes a single entry in all * four S-boxes, where i is the index of the entry to compute, and a and b * are the index numbers preprocessed through the q0 and q1 tables * respectively. */ #define CALC_SB_2(i, a, b) \ ctx->s[0][i] = mds[0][q0[(a) ^ sa] ^ se]; \ ctx->s[1][i] = mds[1][q0[(b) ^ sb] ^ sf]; \ ctx->s[2][i] = mds[2][q1[(a) ^ sc] ^ sg]; \ ctx->s[3][i] = mds[3][q1[(b) ^ sd] ^ sh] /* Macro exactly like CALC_SB_2, but for 192-bit keys. */ #define CALC_SB192_2(i, a, b) \ ctx->s[0][i] = mds[0][q0[q0[(b) ^ sa] ^ se] ^ si]; \ ctx->s[1][i] = mds[1][q0[q1[(b) ^ sb] ^ sf] ^ sj]; \ ctx->s[2][i] = mds[2][q1[q0[(a) ^ sc] ^ sg] ^ sk]; \ ctx->s[3][i] = mds[3][q1[q1[(a) ^ sd] ^ sh] ^ sl]; /* Macro exactly like CALC_SB_2, but for 256-bit keys. */ #define CALC_SB256_2(i, a, b) \ ctx->s[0][i] = mds[0][q0[q0[q1[(b) ^ sa] ^ se] ^ si] ^ sm]; \ ctx->s[1][i] = mds[1][q0[q1[q1[(a) ^ sb] ^ sf] ^ sj] ^ sn]; \ ctx->s[2][i] = mds[2][q1[q0[q0[(a) ^ sc] ^ sg] ^ sk] ^ so]; \ ctx->s[3][i] = mds[3][q1[q1[q0[(b) ^ sd] ^ sh] ^ sl] ^ sp]; /* Macros to calculate the whitening and round subkeys. CALC_K_2 computes the * last two stages of the h() function for a given index (either 2i or 2i+1). * a, b, c, and d are the four bytes going into the last two stages. For * 128-bit keys, this is the entire h() function and a and c are the index * preprocessed through q0 and q1 respectively; for longer keys they are the * output of previous stages. j is the index of the first key byte to use. * CALC_K computes a pair of subkeys for 128-bit Twofish, by calling CALC_K_2 * twice, doing the Pseudo-Hadamard Transform, and doing the necessary * rotations. Its parameters are: a, the array to write the results into, * j, the index of the first output entry, k and l, the preprocessed indices * for index 2i, and m and n, the preprocessed indices for index 2i+1. * CALC_K192_2 expands CALC_K_2 to handle 192-bit keys, by doing an * additional lookup-and-XOR stage. The parameters a, b, c and d are the * four bytes going into the last three stages. For 192-bit keys, c = d * are the index preprocessed through q0, and a = b are the index * preprocessed through q1; j is the index of the first key byte to use. * CALC_K192 is identical to CALC_K but for using the CALC_K192_2 macro * instead of CALC_K_2. * CALC_K256_2 expands CALC_K192_2 to handle 256-bit keys, by doing an * additional lookup-and-XOR stage. The parameters a and b are the index * preprocessed through q0 and q1 respectively; j is the index of the first * key byte to use. CALC_K256 is identical to CALC_K but for using the * CALC_K256_2 macro instead of CALC_K_2. */ #define CALC_K_2(a, b, c, d, j) \ mds[0][q0[a ^ key[(j) + 8]] ^ key[j]] \ ^ mds[1][q0[b ^ key[(j) + 9]] ^ key[(j) + 1]] \ ^ mds[2][q1[c ^ key[(j) + 10]] ^ key[(j) + 2]] \ ^ mds[3][q1[d ^ key[(j) + 11]] ^ key[(j) + 3]] #define CALC_K(a, j, k, l, m, n) \ x = CALC_K_2 (k, l, k, l, 0); \ y = CALC_K_2 (m, n, m, n, 4); \ y = rol32(y, 8); \ x += y; y += x; ctx->a[j] = x; \ ctx->a[(j) + 1] = rol32(y, 9) #define CALC_K192_2(a, b, c, d, j) \ CALC_K_2 (q0[a ^ key[(j) + 16]], \ q1[b ^ key[(j) + 17]], \ q0[c ^ key[(j) + 18]], \ q1[d ^ key[(j) + 19]], j) #define CALC_K192(a, j, k, l, m, n) \ x = CALC_K192_2 (l, l, k, k, 0); \ y = CALC_K192_2 (n, n, m, m, 4); \ y = rol32(y, 8); \ x += y; y += x; ctx->a[j] = x; \ ctx->a[(j) + 1] = rol32(y, 9) #define CALC_K256_2(a, b, j) \ CALC_K192_2 (q1[b ^ key[(j) + 24]], \ q1[a ^ key[(j) + 25]], \ q0[a ^ key[(j) + 26]], \ q0[b ^ key[(j) + 27]], j) #define CALC_K256(a, j, k, l, m, n) \ x = CALC_K256_2 (k, l, 0); \ y = CALC_K256_2 (m, n, 4); \ y = rol32(y, 8); \ x += y; y += x; ctx->a[j] = x; \ ctx->a[(j) + 1] = rol32(y, 9) /* Perform the key setup. */ int __twofish_setkey(struct twofish_ctx *ctx, const u8 *key, unsigned int key_len) { int i, j, k; /* Temporaries for CALC_K. */ u32 x, y; /* The S vector used to key the S-boxes, split up into individual bytes. * 128-bit keys use only sa through sh; 256-bit use all of them. */ u8 sa = 0, sb = 0, sc = 0, sd = 0, se = 0, sf = 0, sg = 0, sh = 0; u8 si = 0, sj = 0, sk = 0, sl = 0, sm = 0, sn = 0, so = 0, sp = 0; /* Temporary for CALC_S. */ u8 tmp; /* Check key length. */ if (key_len % 8) return -EINVAL; /* unsupported key length */ /* Compute the first two words of the S vector. The magic numbers are * the entries of the RS matrix, preprocessed through poly_to_exp. The * numbers in the comments are the original (polynomial form) matrix * entries. */ CALC_S (sa, sb, sc, sd, 0, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */ CALC_S (sa, sb, sc, sd, 1, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */ CALC_S (sa, sb, sc, sd, 2, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */ CALC_S (sa, sb, sc, sd, 3, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */ CALC_S (sa, sb, sc, sd, 4, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */ CALC_S (sa, sb, sc, sd, 5, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */ CALC_S (sa, sb, sc, sd, 6, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */ CALC_S (sa, sb, sc, sd, 7, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */ CALC_S (se, sf, sg, sh, 8, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */ CALC_S (se, sf, sg, sh, 9, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */ CALC_S (se, sf, sg, sh, 10, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */ CALC_S (se, sf, sg, sh, 11, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */ CALC_S (se, sf, sg, sh, 12, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */ CALC_S (se, sf, sg, sh, 13, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */ CALC_S (se, sf, sg, sh, 14, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */ CALC_S (se, sf, sg, sh, 15, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */ if (key_len == 24 || key_len == 32) { /* 192- or 256-bit key */ /* Calculate the third word of the S vector */ CALC_S (si, sj, sk, sl, 16, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */ CALC_S (si, sj, sk, sl, 17, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */ CALC_S (si, sj, sk, sl, 18, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */ CALC_S (si, sj, sk, sl, 19, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */ CALC_S (si, sj, sk, sl, 20, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */ CALC_S (si, sj, sk, sl, 21, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */ CALC_S (si, sj, sk, sl, 22, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */ CALC_S (si, sj, sk, sl, 23, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */ } if (key_len == 32) { /* 256-bit key */ /* Calculate the fourth word of the S vector */ CALC_S (sm, sn, so, sp, 24, 0x00, 0x2D, 0x01, 0x2D); /* 01 A4 02 A4 */ CALC_S (sm, sn, so, sp, 25, 0x2D, 0xA4, 0x44, 0x8A); /* A4 56 A1 55 */ CALC_S (sm, sn, so, sp, 26, 0x8A, 0xD5, 0xBF, 0xD1); /* 55 82 FC 87 */ CALC_S (sm, sn, so, sp, 27, 0xD1, 0x7F, 0x3D, 0x99); /* 87 F3 C1 5A */ CALC_S (sm, sn, so, sp, 28, 0x99, 0x46, 0x66, 0x96); /* 5A 1E 47 58 */ CALC_S (sm, sn, so, sp, 29, 0x96, 0x3C, 0x5B, 0xED); /* 58 C6 AE DB */ CALC_S (sm, sn, so, sp, 30, 0xED, 0x37, 0x4F, 0xE0); /* DB 68 3D 9E */ CALC_S (sm, sn, so, sp, 31, 0xE0, 0xD0, 0x8C, 0x17); /* 9E E5 19 03 */ /* Compute the S-boxes. */ for ( i = j = 0, k = 1; i < 256; i++, j += 2, k += 2 ) { CALC_SB256_2( i, calc_sb_tbl[j], calc_sb_tbl[k] ); } /* CALC_K256/CALC_K192/CALC_K loops were unrolled. * Unrolling produced x2.5 more code (+18k on i386), * and speeded up key setup by 7%: * unrolled: twofish_setkey/sec: 41128 * loop: twofish_setkey/sec: 38148 * CALC_K256: ~100 insns each * CALC_K192: ~90 insns * CALC_K: ~70 insns */ /* Calculate whitening and round subkeys */ for ( i = 0; i < 8; i += 2 ) { CALC_K256 (w, i, q0[i], q1[i], q0[i+1], q1[i+1]); } for ( i = 0; i < 32; i += 2 ) { CALC_K256 (k, i, q0[i+8], q1[i+8], q0[i+9], q1[i+9]); } } else if (key_len == 24) { /* 192-bit key */ /* Compute the S-boxes. */ for ( i = j = 0, k = 1; i < 256; i++, j += 2, k += 2 ) { CALC_SB192_2( i, calc_sb_tbl[j], calc_sb_tbl[k] ); } /* Calculate whitening and round subkeys */ for ( i = 0; i < 8; i += 2 ) { CALC_K192 (w, i, q0[i], q1[i], q0[i+1], q1[i+1]); } for ( i = 0; i < 32; i += 2 ) { CALC_K192 (k, i, q0[i+8], q1[i+8], q0[i+9], q1[i+9]); } } else { /* 128-bit key */ /* Compute the S-boxes. */ for ( i = j = 0, k = 1; i < 256; i++, j += 2, k += 2 ) { CALC_SB_2( i, calc_sb_tbl[j], calc_sb_tbl[k] ); } /* Calculate whitening and round subkeys */ for ( i = 0; i < 8; i += 2 ) { CALC_K (w, i, q0[i], q1[i], q0[i+1], q1[i+1]); } for ( i = 0; i < 32; i += 2 ) { CALC_K (k, i, q0[i+8], q1[i+8], q0[i+9], q1[i+9]); } } return 0; } EXPORT_SYMBOL_GPL(__twofish_setkey); int twofish_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { return __twofish_setkey(crypto_tfm_ctx(tfm), key, key_len); } EXPORT_SYMBOL_GPL(twofish_setkey); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Twofish cipher common functions"); |
5 5 5 1 5 5 5 5 2 5 5 3 5 5 5 5 5 1 1 1 1 1 2 1 1 2 2 5 5 3 3 3 2 5 5 3 3 2 2 2 2 2 2 2 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 | // SPDX-License-Identifier: GPL-2.0-or-later /* Virtio ring implementation. * * Copyright 2007 Rusty Russell IBM Corporation */ #include <linux/virtio.h> #include <linux/virtio_ring.h> #include <linux/virtio_config.h> #include <linux/device.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/hrtimer.h> #include <linux/dma-mapping.h> #include <linux/kmsan.h> #include <linux/spinlock.h> #include <xen/xen.h> #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) \ do { \ dev_err(&(_vq)->vq.vdev->dev, \ "%s:"fmt, (_vq)->vq.name, ##args); \ BUG(); \ } while (0) /* Caller is supposed to guarantee no reentry. */ #define START_USE(_vq) \ do { \ if ((_vq)->in_use) \ panic("%s:in_use = %i\n", \ (_vq)->vq.name, (_vq)->in_use); \ (_vq)->in_use = __LINE__; \ } while (0) #define END_USE(_vq) \ do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0) #define LAST_ADD_TIME_UPDATE(_vq) \ do { \ ktime_t now = ktime_get(); \ \ /* No kick or get, with .1 second between? Warn. */ \ if ((_vq)->last_add_time_valid) \ WARN_ON(ktime_to_ms(ktime_sub(now, \ (_vq)->last_add_time)) > 100); \ (_vq)->last_add_time = now; \ (_vq)->last_add_time_valid = true; \ } while (0) #define LAST_ADD_TIME_CHECK(_vq) \ do { \ if ((_vq)->last_add_time_valid) { \ WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), \ (_vq)->last_add_time)) > 100); \ } \ } while (0) #define LAST_ADD_TIME_INVALID(_vq) \ ((_vq)->last_add_time_valid = false) #else #define BAD_RING(_vq, fmt, args...) \ do { \ dev_err(&_vq->vq.vdev->dev, \ "%s:"fmt, (_vq)->vq.name, ##args); \ (_vq)->broken = true; \ } while (0) #define START_USE(vq) #define END_USE(vq) #define LAST_ADD_TIME_UPDATE(vq) #define LAST_ADD_TIME_CHECK(vq) #define LAST_ADD_TIME_INVALID(vq) #endif struct vring_desc_state_split { void *data; /* Data for callback. */ /* Indirect desc table and extra table, if any. These two will be * allocated together. So we won't stress more to the memory allocator. */ struct vring_desc *indir_desc; }; struct vring_desc_state_packed { void *data; /* Data for callback. */ /* Indirect desc table and extra table, if any. These two will be * allocated together. So we won't stress more to the memory allocator. */ struct vring_packed_desc *indir_desc; u16 num; /* Descriptor list length. */ u16 last; /* The last desc state in a list. */ }; struct vring_desc_extra { dma_addr_t addr; /* Descriptor DMA addr. */ u32 len; /* Descriptor length. */ u16 flags; /* Descriptor flags. */ u16 next; /* The next desc state in a list. */ }; struct vring_virtqueue_split { /* Actual memory layout for this queue. */ struct vring vring; /* Last written value to avail->flags */ u16 avail_flags_shadow; /* * Last written value to avail->idx in * guest byte order. */ u16 avail_idx_shadow; /* Per-descriptor state. */ struct vring_desc_state_split *desc_state; struct vring_desc_extra *desc_extra; /* DMA address and size information */ dma_addr_t queue_dma_addr; size_t queue_size_in_bytes; /* * The parameters for creating vrings are reserved for creating new * vring. */ u32 vring_align; bool may_reduce_num; }; struct vring_virtqueue_packed { /* Actual memory layout for this queue. */ struct { unsigned int num; struct vring_packed_desc *desc; struct vring_packed_desc_event *driver; struct vring_packed_desc_event *device; } vring; /* Driver ring wrap counter. */ bool avail_wrap_counter; /* Avail used flags. */ u16 avail_used_flags; /* Index of the next avail descriptor. */ u16 next_avail_idx; /* * Last written value to driver->flags in * guest byte order. */ u16 event_flags_shadow; /* Per-descriptor state. */ struct vring_desc_state_packed *desc_state; struct vring_desc_extra *desc_extra; /* DMA address and size information */ dma_addr_t ring_dma_addr; dma_addr_t driver_event_dma_addr; dma_addr_t device_event_dma_addr; size_t ring_size_in_bytes; size_t event_size_in_bytes; }; struct vring_virtqueue { struct virtqueue vq; /* Is this a packed ring? */ bool packed_ring; /* Is DMA API used? */ bool use_dma_api; /* Can we use weak barriers? */ bool weak_barriers; /* Other side has made a mess, don't try any more. */ bool broken; /* Host supports indirect buffers */ bool indirect; /* Host publishes avail event idx */ bool event; /* Head of free buffer list. */ unsigned int free_head; /* Number we've added since last sync. */ unsigned int num_added; /* Last used index we've seen. * for split ring, it just contains last used index * for packed ring: * bits up to VRING_PACKED_EVENT_F_WRAP_CTR include the last used index. * bits from VRING_PACKED_EVENT_F_WRAP_CTR include the used wrap counter. */ u16 last_used_idx; /* Hint for event idx: already triggered no need to disable. */ bool event_triggered; union { /* Available for split ring */ struct vring_virtqueue_split split; /* Available for packed ring */ struct vring_virtqueue_packed packed; }; /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); /* DMA, allocation, and size information */ bool we_own_ring; /* Device used for doing DMA */ struct device *dma_dev; #ifdef DEBUG /* They're supposed to lock for us. */ unsigned int in_use; /* Figure out if their kicks are too delayed. */ bool last_add_time_valid; ktime_t last_add_time; #endif }; static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num); static void vring_free(struct virtqueue *_vq); /* * Helpers. */ #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq) static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, unsigned int total_sg) { /* * If the host supports indirect descriptor tables, and we have multiple * buffers, then go indirect. FIXME: tune this threshold */ return (vq->indirect && total_sg > 1 && vq->vq.num_free); } /* * Modern virtio devices have feature bits to specify whether they need a * quirk and bypass the IOMMU. If not there, just use the DMA API. * * If there, the interaction between virtio and DMA API is messy. * * On most systems with virtio, physical addresses match bus addresses, * and it doesn't particularly matter whether we use the DMA API. * * On some systems, including Xen and any system with a physical device * that speaks virtio behind a physical IOMMU, we must use the DMA API * for virtio DMA to work at all. * * On other systems, including SPARC and PPC64, virtio-pci devices are * enumerated as though they are behind an IOMMU, but the virtio host * ignores the IOMMU, so we must either pretend that the IOMMU isn't * there or somehow map everything as the identity. * * For the time being, we preserve historic behavior and bypass the DMA * API. * * TODO: install a per-device DMA ops structure that does the right thing * taking into account all the above quirks, and use the DMA API * unconditionally on data path. */ static bool vring_use_dma_api(const struct virtio_device *vdev) { if (!virtio_has_dma_quirk(vdev)) return true; /* Otherwise, we are left to guess. */ /* * In theory, it's possible to have a buggy QEMU-supposed * emulated Q35 IOMMU and Xen enabled at the same time. On * such a configuration, virtio has never worked and will * not work without an even larger kludge. Instead, enable * the DMA API if we're a Xen guest, which at least allows * all of the sensible Xen configurations to work correctly. */ if (xen_domain()) return true; return false; } static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring, const struct vring_desc_extra *extra) { return vring->use_dma_api && (extra->addr != DMA_MAPPING_ERROR); } size_t virtio_max_dma_size(const struct virtio_device *vdev) { size_t max_segment_size = SIZE_MAX; if (vring_use_dma_api(vdev)) max_segment_size = dma_max_mapping_size(vdev->dev.parent); return max_segment_size; } EXPORT_SYMBOL_GPL(virtio_max_dma_size); static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, dma_addr_t *dma_handle, gfp_t flag, struct device *dma_dev) { if (vring_use_dma_api(vdev)) { return dma_alloc_coherent(dma_dev, size, dma_handle, flag); } else { void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); if (queue) { phys_addr_t phys_addr = virt_to_phys(queue); *dma_handle = (dma_addr_t)phys_addr; /* * Sanity check: make sure we dind't truncate * the address. The only arches I can find that * have 64-bit phys_addr_t but 32-bit dma_addr_t * are certain non-highmem MIPS and x86 * configurations, but these configurations * should never allocate physical pages above 32 * bits, so this is fine. Just in case, throw a * warning and abort if we end up with an * unrepresentable address. */ if (WARN_ON_ONCE(*dma_handle != phys_addr)) { free_pages_exact(queue, PAGE_ALIGN(size)); return NULL; } } return queue; } } static void vring_free_queue(struct virtio_device *vdev, size_t size, void *queue, dma_addr_t dma_handle, struct device *dma_dev) { if (vring_use_dma_api(vdev)) dma_free_coherent(dma_dev, size, queue, dma_handle); else free_pages_exact(queue, PAGE_ALIGN(size)); } /* * The DMA ops on various arches are rather gnarly right now, and * making all of the arch DMA ops work on the vring device itself * is a mess. */ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) { return vq->dma_dev; } /* Map one sg entry. */ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction, dma_addr_t *addr, u32 *len, bool premapped) { if (premapped) { *addr = sg_dma_address(sg); *len = sg_dma_len(sg); return 0; } *len = sg->length; if (!vq->use_dma_api) { /* * If DMA is not used, KMSAN doesn't know that the scatterlist * is initialized by the hardware. Explicitly check/unpoison it * depending on the direction. */ kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); *addr = (dma_addr_t)sg_phys(sg); return 0; } /* * We can't use dma_map_sg, because we don't use scatterlists in * the way it expects (we don't guarantee that the scatterlist * will exist for the lifetime of the mapping). */ *addr = dma_map_page(vring_dma_dev(vq), sg_page(sg), sg->offset, sg->length, direction); if (dma_mapping_error(vring_dma_dev(vq), *addr)) return -ENOMEM; return 0; } static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (!vq->use_dma_api) return (dma_addr_t)virt_to_phys(cpu_addr); return dma_map_single(vring_dma_dev(vq), cpu_addr, size, direction); } static int vring_mapping_error(const struct vring_virtqueue *vq, dma_addr_t addr) { if (!vq->use_dma_api) return 0; return dma_mapping_error(vring_dma_dev(vq), addr); } static void virtqueue_init(struct vring_virtqueue *vq, u32 num) { vq->vq.num_free = num; if (vq->packed_ring) vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); else vq->last_used_idx = 0; vq->event_triggered = false; vq->num_added = 0; #ifdef DEBUG vq->in_use = false; vq->last_add_time_valid = false; #endif } /* * Split ring specific functions - *_split(). */ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, struct vring_desc_extra *extra) { u16 flags; flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { if (!vq->use_dma_api) goto out; dma_unmap_single(vring_dma_dev(vq), extra->addr, extra->len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { if (!vring_need_unmap_buffer(vq, extra)) goto out; dma_unmap_page(vring_dma_dev(vq), extra->addr, extra->len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } out: return extra->next; } static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, unsigned int total_sg, gfp_t gfp) { struct vring_desc_extra *extra; struct vring_desc *desc; unsigned int i, size; /* * We require lowmem mappings for the descriptors because * otherwise virt_to_phys will give us bogus addresses in the * virtqueue. */ gfp &= ~__GFP_HIGHMEM; size = sizeof(*desc) * total_sg + sizeof(*extra) * total_sg; desc = kmalloc(size, gfp); if (!desc) return NULL; extra = (struct vring_desc_extra *)&desc[total_sg]; for (i = 0; i < total_sg; i++) extra[i].next = i + 1; return desc; } static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, struct vring_desc *desc, struct vring_desc_extra *extra, unsigned int i, dma_addr_t addr, unsigned int len, u16 flags, bool premapped) { u16 next; desc[i].flags = cpu_to_virtio16(vq->vdev, flags); desc[i].addr = cpu_to_virtio64(vq->vdev, addr); desc[i].len = cpu_to_virtio32(vq->vdev, len); extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr; extra[i].len = len; extra[i].flags = flags; next = extra[i].next; desc[i].next = cpu_to_virtio16(vq->vdev, next); return next; } static inline int virtqueue_add_split(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, void *ctx, bool premapped, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); struct vring_desc_extra *extra; struct scatterlist *sg; struct vring_desc *desc; unsigned int i, n, avail, descs_used, prev, err_idx; int head; bool indirect; START_USE(vq); BUG_ON(data == NULL); BUG_ON(ctx && vq->indirect); if (unlikely(vq->broken)) { END_USE(vq); return -EIO; } LAST_ADD_TIME_UPDATE(vq); BUG_ON(total_sg == 0); head = vq->free_head; if (virtqueue_use_indirect(vq, total_sg)) desc = alloc_indirect_split(_vq, total_sg, gfp); else { desc = NULL; WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect); } if (desc) { /* Use a single buffer which doesn't continue */ indirect = true; /* Set up rest to use this indirect table. */ i = 0; descs_used = 1; extra = (struct vring_desc_extra *)&desc[total_sg]; } else { indirect = false; desc = vq->split.vring.desc; extra = vq->split.desc_extra; i = head; descs_used = total_sg; } if (unlikely(vq->vq.num_free < descs_used)) { pr_debug("Can't add buf len %i - avail = %i\n", descs_used, vq->vq.num_free); /* FIXME: for historical reasons, we force a notify here if * there are outgoing parts to the buffer. Presumably the * host should service the ring ASAP. */ if (out_sgs) vq->notify(&vq->vq); if (indirect) kfree(desc); END_USE(vq); return -ENOSPC; } for (n = 0; n < out_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; u32 len; if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr, &len, premapped)) goto unmap_release; prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, VRING_DESC_F_NEXT, premapped); } } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; u32 len; if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr, &len, premapped)) goto unmap_release; prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE, premapped); } } /* Last one doesn't continue. */ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); if (!indirect && vring_need_unmap_buffer(vq, &extra[prev])) vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= ~VRING_DESC_F_NEXT; if (indirect) { /* Now that the indirect table is filled in, map it. */ dma_addr_t addr = vring_map_single( vq, desc, total_sg * sizeof(struct vring_desc), DMA_TO_DEVICE); if (vring_mapping_error(vq, addr)) goto unmap_release; virtqueue_add_desc_split(_vq, vq->split.vring.desc, vq->split.desc_extra, head, addr, total_sg * sizeof(struct vring_desc), VRING_DESC_F_INDIRECT, false); } /* We're using some buffers from the free list. */ vq->vq.num_free -= descs_used; /* Update free pointer */ if (indirect) vq->free_head = vq->split.desc_extra[head].next; else vq->free_head = i; /* Store token and indirect buffer state. */ vq->split.desc_state[head].data = data; if (indirect) vq->split.desc_state[head].indir_desc = desc; else vq->split.desc_state[head].indir_desc = ctx; /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(vq->weak_barriers); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); vq->num_added++; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); /* This is very unlikely, but theoretically possible. Kick * just in case. */ if (unlikely(vq->num_added == (1 << 16) - 1)) virtqueue_kick(_vq); return 0; unmap_release: err_idx = i; if (indirect) i = 0; else i = head; for (n = 0; n < total_sg; n++) { if (i == err_idx) break; i = vring_unmap_one_split(vq, &extra[i]); } if (indirect) kfree(desc); END_USE(vq); return -ENOMEM; } static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old; bool needs_kick; START_USE(vq); /* We need to expose available array entries before checking avail * event. */ virtio_mb(vq->weak_barriers); old = vq->split.avail_idx_shadow - vq->num_added; new = vq->split.avail_idx_shadow; vq->num_added = 0; LAST_ADD_TIME_CHECK(vq); LAST_ADD_TIME_INVALID(vq); if (vq->event) { needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->split.vring)), new, old); } else { needs_kick = !(vq->split.vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY)); } END_USE(vq); return needs_kick; } static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, void **ctx) { struct vring_desc_extra *extra; unsigned int i, j; __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); /* Clear data ptr. */ vq->split.desc_state[head].data = NULL; extra = vq->split.desc_extra; /* Put back on free list: unmap first-level descriptors and find end */ i = head; while (vq->split.vring.desc[i].flags & nextflag) { vring_unmap_one_split(vq, &extra[i]); i = vq->split.desc_extra[i].next; vq->vq.num_free++; } vring_unmap_one_split(vq, &extra[i]); vq->split.desc_extra[i].next = vq->free_head; vq->free_head = head; /* Plus final descriptor */ vq->vq.num_free++; if (vq->indirect) { struct vring_desc *indir_desc = vq->split.desc_state[head].indir_desc; u32 len, num; /* Free the indirect table, if any, now that it's unmapped. */ if (!indir_desc) return; len = vq->split.desc_extra[head].len; BUG_ON(!(vq->split.desc_extra[head].flags & VRING_DESC_F_INDIRECT)); BUG_ON(len == 0 || len % sizeof(struct vring_desc)); num = len / sizeof(struct vring_desc); extra = (struct vring_desc_extra *)&indir_desc[num]; if (vq->use_dma_api) { for (j = 0; j < num; j++) vring_unmap_one_split(vq, &extra[j]); } kfree(indir_desc); vq->split.desc_state[head].indir_desc = NULL; } else if (ctx) { *ctx = vq->split.desc_state[head].indir_desc; } } static bool more_used_split(const struct vring_virtqueue *vq) { return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->split.vring.used->idx); } static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len, void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); void *ret; unsigned int i; u16 last_used; START_USE(vq); if (unlikely(vq->broken)) { END_USE(vq); return NULL; } if (!more_used_split(vq)) { pr_debug("No more buffers in queue\n"); END_USE(vq); return NULL; } /* Only get used array entries after they have been exposed by host. */ virtio_rmb(vq->weak_barriers); last_used = (vq->last_used_idx & (vq->split.vring.num - 1)); i = virtio32_to_cpu(_vq->vdev, vq->split.vring.used->ring[last_used].id); *len = virtio32_to_cpu(_vq->vdev, vq->split.vring.used->ring[last_used].len); if (unlikely(i >= vq->split.vring.num)) { BAD_RING(vq, "id %u out of range\n", i); return NULL; } if (unlikely(!vq->split.desc_state[i].data)) { BAD_RING(vq, "id %u is not a head!\n", i); return NULL; } /* detach_buf_split clears data, so grab it now. */ ret = vq->split.desc_state[i].data; detach_buf_split(vq, i, ctx); vq->last_used_idx++; /* If we expect an interrupt for the next entry, tell host * by writing event index and flush out the write before * the read in the next get_buf call. */ if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) virtio_store_mb(vq->weak_barriers, &vring_used_event(&vq->split.vring), cpu_to_virtio16(_vq->vdev, vq->last_used_idx)); LAST_ADD_TIME_INVALID(vq); END_USE(vq); return ret; } static void virtqueue_disable_cb_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; /* * If device triggered an event already it won't trigger one again: * no need to disable. */ if (vq->event_triggered) return; if (vq->event) /* TODO: this is a hack. Figure out a cleaner value to write. */ vring_used_event(&vq->split.vring) = 0x0; else vq->split.vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->split.avail_flags_shadow); } } static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 last_used_idx; START_USE(vq); /* We optimistically turn back on interrupts, then check if there was * more to do. */ /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to * either clear the flags bit or point the event index at the next * entry. Always do both to keep code simple. */ if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vq->split.vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->split.avail_flags_shadow); } vring_used_event(&vq->split.vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx); END_USE(vq); return last_used_idx; } static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned int last_used_idx) { struct vring_virtqueue *vq = to_vvq(_vq); return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx); } static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 bufs; START_USE(vq); /* We optimistically turn back on interrupts, then check if there was * more to do. */ /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to * either clear the flags bit or point the event index at the next * entry. Always update the event index to keep code simple. */ if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vq->split.vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->split.avail_flags_shadow); } /* TODO: tune this threshold */ bufs = (u16)(vq->split.avail_idx_shadow - vq->last_used_idx) * 3 / 4; virtio_store_mb(vq->weak_barriers, &vring_used_event(&vq->split.vring), cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs)); if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx) - vq->last_used_idx) > bufs)) { END_USE(vq); return false; } END_USE(vq); return true; } static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i; void *buf; START_USE(vq); for (i = 0; i < vq->split.vring.num; i++) { if (!vq->split.desc_state[i].data) continue; /* detach_buf_split clears data, so grab it now. */ buf = vq->split.desc_state[i].data; detach_buf_split(vq, i, NULL); vq->split.avail_idx_shadow--; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); END_USE(vq); return buf; } /* That should have freed everything. */ BUG_ON(vq->vq.num_free != vq->split.vring.num); END_USE(vq); return NULL; } static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split, struct vring_virtqueue *vq) { struct virtio_device *vdev; vdev = vq->vq.vdev; vring_split->avail_flags_shadow = 0; vring_split->avail_idx_shadow = 0; /* No callback? Tell other side not to bother us. */ if (!vq->vq.callback) { vring_split->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vring_split->vring.avail->flags = cpu_to_virtio16(vdev, vring_split->avail_flags_shadow); } } static void virtqueue_reinit_split(struct vring_virtqueue *vq) { int num; num = vq->split.vring.num; vq->split.vring.avail->flags = 0; vq->split.vring.avail->idx = 0; /* reset avail event */ vq->split.vring.avail->ring[num] = 0; vq->split.vring.used->flags = 0; vq->split.vring.used->idx = 0; /* reset used event */ *(__virtio16 *)&(vq->split.vring.used->ring[num]) = 0; virtqueue_init(vq, num); virtqueue_vring_init_split(&vq->split, vq); } static void virtqueue_vring_attach_split(struct vring_virtqueue *vq, struct vring_virtqueue_split *vring_split) { vq->split = *vring_split; /* Put everything in free lists. */ vq->free_head = 0; } static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split) { struct vring_desc_state_split *state; struct vring_desc_extra *extra; u32 num = vring_split->vring.num; state = kmalloc_array(num, sizeof(struct vring_desc_state_split), GFP_KERNEL); if (!state) goto err_state; extra = vring_alloc_desc_extra(num); if (!extra) goto err_extra; memset(state, 0, num * sizeof(struct vring_desc_state_split)); vring_split->desc_state = state; vring_split->desc_extra = extra; return 0; err_extra: kfree(state); err_state: return -ENOMEM; } static void vring_free_split(struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, struct device *dma_dev) { vring_free_queue(vdev, vring_split->queue_size_in_bytes, vring_split->vring.desc, vring_split->queue_dma_addr, dma_dev); kfree(vring_split->desc_state); kfree(vring_split->desc_extra); } static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, u32 num, unsigned int vring_align, bool may_reduce_num, struct device *dma_dev) { void *queue = NULL; dma_addr_t dma_addr; /* We assume num is a power of 2. */ if (!is_power_of_2(num)) { dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); return -EINVAL; } /* TODO: allocate each queue chunk individually */ for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) { queue = vring_alloc_queue(vdev, vring_size(num, vring_align), &dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, dma_dev); if (queue) break; if (!may_reduce_num) return -ENOMEM; } if (!num) return -ENOMEM; if (!queue) { /* Try to get a single page. You are my only hope! */ queue = vring_alloc_queue(vdev, vring_size(num, vring_align), &dma_addr, GFP_KERNEL | __GFP_ZERO, dma_dev); } if (!queue) return -ENOMEM; vring_init(&vring_split->vring, num, queue, vring_align); vring_split->queue_dma_addr = dma_addr; vring_split->queue_size_in_bytes = vring_size(num, vring_align); vring_split->vring_align = vring_align; vring_split->may_reduce_num = may_reduce_num; return 0; } static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, bool weak_barriers, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { struct vring_virtqueue *vq; int err; vq = kmalloc(sizeof(*vq), GFP_KERNEL); if (!vq) return NULL; vq->packed_ring = false; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; vq->vq.index = index; vq->vq.reset = false; vq->we_own_ring = false; vq->notify = notify; vq->weak_barriers = weak_barriers; #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION vq->broken = true; #else vq->broken = false; #endif vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; err = vring_alloc_state_extra_split(vring_split); if (err) { kfree(vq); return NULL; } virtqueue_vring_init_split(vring_split, vq); virtqueue_init(vq, vring_split->vring.num); virtqueue_vring_attach_split(vq, vring_split); spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); spin_unlock(&vdev->vqs_list_lock); return &vq->vq; } static struct virtqueue *vring_create_virtqueue_split( unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { struct vring_virtqueue_split vring_split = {}; struct virtqueue *vq; int err; err = vring_alloc_queue_split(&vring_split, vdev, num, vring_align, may_reduce_num, dma_dev); if (err) return NULL; vq = __vring_new_virtqueue_split(index, &vring_split, vdev, weak_barriers, context, notify, callback, name, dma_dev); if (!vq) { vring_free_split(&vring_split, vdev, dma_dev); return NULL; } to_vvq(vq)->we_own_ring = true; return vq; } static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) { struct vring_virtqueue_split vring_split = {}; struct vring_virtqueue *vq = to_vvq(_vq); struct virtio_device *vdev = _vq->vdev; int err; err = vring_alloc_queue_split(&vring_split, vdev, num, vq->split.vring_align, vq->split.may_reduce_num, vring_dma_dev(vq)); if (err) goto err; err = vring_alloc_state_extra_split(&vring_split); if (err) goto err_state_extra; vring_free(&vq->vq); virtqueue_vring_init_split(&vring_split, vq); virtqueue_init(vq, vring_split.vring.num); virtqueue_vring_attach_split(vq, &vring_split); return 0; err_state_extra: vring_free_split(&vring_split, vdev, vring_dma_dev(vq)); err: virtqueue_reinit_split(vq); return -ENOMEM; } /* * Packed ring specific functions - *_packed(). */ static bool packed_used_wrap_counter(u16 last_used_idx) { return !!(last_used_idx & (1 << VRING_PACKED_EVENT_F_WRAP_CTR)); } static u16 packed_last_used(u16 last_used_idx) { return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR)); } static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, const struct vring_desc_extra *extra) { u16 flags; flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { if (!vq->use_dma_api) return; dma_unmap_single(vring_dma_dev(vq), extra->addr, extra->len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { if (!vring_need_unmap_buffer(vq, extra)) return; dma_unmap_page(vring_dma_dev(vq), extra->addr, extra->len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } } static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg, gfp_t gfp) { struct vring_desc_extra *extra; struct vring_packed_desc *desc; int i, size; /* * We require lowmem mappings for the descriptors because * otherwise virt_to_phys will give us bogus addresses in the * virtqueue. */ gfp &= ~__GFP_HIGHMEM; size = (sizeof(*desc) + sizeof(*extra)) * total_sg; desc = kmalloc(size, gfp); if (!desc) return NULL; extra = (struct vring_desc_extra *)&desc[total_sg]; for (i = 0; i < total_sg; i++) extra[i].next = i + 1; return desc; } static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, bool premapped, gfp_t gfp) { struct vring_desc_extra *extra; struct vring_packed_desc *desc; struct scatterlist *sg; unsigned int i, n, err_idx, len; u16 head, id; dma_addr_t addr; head = vq->packed.next_avail_idx; desc = alloc_indirect_packed(total_sg, gfp); if (!desc) return -ENOMEM; extra = (struct vring_desc_extra *)&desc[total_sg]; if (unlikely(vq->vq.num_free < 1)) { pr_debug("Can't add buf len 1 - avail = 0\n"); kfree(desc); END_USE(vq); return -ENOSPC; } i = 0; id = vq->free_head; BUG_ON(id == vq->packed.vring.num); for (n = 0; n < out_sgs + in_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { if (vring_map_one_sg(vq, sg, n < out_sgs ? DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr, &len, premapped)) goto unmap_release; desc[i].flags = cpu_to_le16(n < out_sgs ? 0 : VRING_DESC_F_WRITE); desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(len); if (unlikely(vq->use_dma_api)) { extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr; extra[i].len = len; extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE; } i++; } } /* Now that the indirect table is filled in, map it. */ addr = vring_map_single(vq, desc, total_sg * sizeof(struct vring_packed_desc), DMA_TO_DEVICE); if (vring_mapping_error(vq, addr)) goto unmap_release; vq->packed.vring.desc[head].addr = cpu_to_le64(addr); vq->packed.vring.desc[head].len = cpu_to_le32(total_sg * sizeof(struct vring_packed_desc)); vq->packed.vring.desc[head].id = cpu_to_le16(id); if (vq->use_dma_api) { vq->packed.desc_extra[id].addr = addr; vq->packed.desc_extra[id].len = total_sg * sizeof(struct vring_packed_desc); vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT | vq->packed.avail_used_flags; } /* * A driver MUST NOT make the first descriptor in the list * available before all subsequent descriptors comprising * the list are made available. */ virtio_wmb(vq->weak_barriers); vq->packed.vring.desc[head].flags = cpu_to_le16(VRING_DESC_F_INDIRECT | vq->packed.avail_used_flags); /* We're using some buffers from the free list. */ vq->vq.num_free -= 1; /* Update free pointer */ n = head + 1; if (n >= vq->packed.vring.num) { n = 0; vq->packed.avail_wrap_counter ^= 1; vq->packed.avail_used_flags ^= 1 << VRING_PACKED_DESC_F_AVAIL | 1 << VRING_PACKED_DESC_F_USED; } vq->packed.next_avail_idx = n; vq->free_head = vq->packed.desc_extra[id].next; /* Store token and indirect buffer state. */ vq->packed.desc_state[id].num = 1; vq->packed.desc_state[id].data = data; vq->packed.desc_state[id].indir_desc = desc; vq->packed.desc_state[id].last = id; vq->num_added += 1; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); return 0; unmap_release: err_idx = i; for (i = 0; i < err_idx; i++) vring_unmap_extra_packed(vq, &extra[i]); kfree(desc); END_USE(vq); return -ENOMEM; } static inline int virtqueue_add_packed(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, void *ctx, bool premapped, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); struct vring_packed_desc *desc; struct scatterlist *sg; unsigned int i, n, c, descs_used, err_idx, len; __le16 head_flags, flags; u16 head, id, prev, curr, avail_used_flags; int err; START_USE(vq); BUG_ON(data == NULL); BUG_ON(ctx && vq->indirect); if (unlikely(vq->broken)) { END_USE(vq); return -EIO; } LAST_ADD_TIME_UPDATE(vq); BUG_ON(total_sg == 0); if (virtqueue_use_indirect(vq, total_sg)) { err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, in_sgs, data, premapped, gfp); if (err != -ENOMEM) { END_USE(vq); return err; } /* fall back on direct */ } head = vq->packed.next_avail_idx; avail_used_flags = vq->packed.avail_used_flags; WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect); desc = vq->packed.vring.desc; i = head; descs_used = total_sg; if (unlikely(vq->vq.num_free < descs_used)) { pr_debug("Can't add buf len %i - avail = %i\n", descs_used, vq->vq.num_free); END_USE(vq); return -ENOSPC; } id = vq->free_head; BUG_ON(id == vq->packed.vring.num); curr = id; c = 0; for (n = 0; n < out_sgs + in_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; if (vring_map_one_sg(vq, sg, n < out_sgs ? DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr, &len, premapped)) goto unmap_release; flags = cpu_to_le16(vq->packed.avail_used_flags | (++c == total_sg ? 0 : VRING_DESC_F_NEXT) | (n < out_sgs ? 0 : VRING_DESC_F_WRITE)); if (i == head) head_flags = flags; else desc[i].flags = flags; desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(len); desc[i].id = cpu_to_le16(id); if (unlikely(vq->use_dma_api)) { vq->packed.desc_extra[curr].addr = premapped ? DMA_MAPPING_ERROR : addr; vq->packed.desc_extra[curr].len = len; vq->packed.desc_extra[curr].flags = le16_to_cpu(flags); } prev = curr; curr = vq->packed.desc_extra[curr].next; if ((unlikely(++i >= vq->packed.vring.num))) { i = 0; vq->packed.avail_used_flags ^= 1 << VRING_PACKED_DESC_F_AVAIL | 1 << VRING_PACKED_DESC_F_USED; } } } if (i <= head) vq->packed.avail_wrap_counter ^= 1; /* We're using some buffers from the free list. */ vq->vq.num_free -= descs_used; /* Update free pointer */ vq->packed.next_avail_idx = i; vq->free_head = curr; /* Store token. */ vq->packed.desc_state[id].num = descs_used; vq->packed.desc_state[id].data = data; vq->packed.desc_state[id].indir_desc = ctx; vq->packed.desc_state[id].last = prev; /* * A driver MUST NOT make the first descriptor in the list * available before all subsequent descriptors comprising * the list are made available. */ virtio_wmb(vq->weak_barriers); vq->packed.vring.desc[head].flags = head_flags; vq->num_added += descs_used; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); return 0; unmap_release: err_idx = i; i = head; curr = vq->free_head; vq->packed.avail_used_flags = avail_used_flags; for (n = 0; n < total_sg; n++) { if (i == err_idx) break; vring_unmap_extra_packed(vq, &vq->packed.desc_extra[curr]); curr = vq->packed.desc_extra[curr].next; i++; if (i >= vq->packed.vring.num) i = 0; } END_USE(vq); return -EIO; } static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old, off_wrap, flags, wrap_counter, event_idx; bool needs_kick; union { struct { __le16 off_wrap; __le16 flags; }; u32 u32; } snapshot; START_USE(vq); /* * We need to expose the new flags value before checking notification * suppressions. */ virtio_mb(vq->weak_barriers); old = vq->packed.next_avail_idx - vq->num_added; new = vq->packed.next_avail_idx; vq->num_added = 0; snapshot.u32 = *(u32 *)vq->packed.vring.device; flags = le16_to_cpu(snapshot.flags); LAST_ADD_TIME_CHECK(vq); LAST_ADD_TIME_INVALID(vq); if (flags != VRING_PACKED_EVENT_FLAG_DESC) { needs_kick = (flags != VRING_PACKED_EVENT_FLAG_DISABLE); goto out; } off_wrap = le16_to_cpu(snapshot.off_wrap); wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; event_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); if (wrap_counter != vq->packed.avail_wrap_counter) event_idx -= vq->packed.vring.num; needs_kick = vring_need_event(event_idx, new, old); out: END_USE(vq); return needs_kick; } static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int id, void **ctx) { struct vring_desc_state_packed *state = NULL; struct vring_packed_desc *desc; unsigned int i, curr; state = &vq->packed.desc_state[id]; /* Clear data ptr. */ state->data = NULL; vq->packed.desc_extra[state->last].next = vq->free_head; vq->free_head = id; vq->vq.num_free += state->num; if (unlikely(vq->use_dma_api)) { curr = id; for (i = 0; i < state->num; i++) { vring_unmap_extra_packed(vq, &vq->packed.desc_extra[curr]); curr = vq->packed.desc_extra[curr].next; } } if (vq->indirect) { struct vring_desc_extra *extra; u32 len, num; /* Free the indirect table, if any, now that it's unmapped. */ desc = state->indir_desc; if (!desc) return; if (vq->use_dma_api) { len = vq->packed.desc_extra[id].len; num = len / sizeof(struct vring_packed_desc); extra = (struct vring_desc_extra *)&desc[num]; for (i = 0; i < num; i++) vring_unmap_extra_packed(vq, &extra[i]); } kfree(desc); state->indir_desc = NULL; } else if (ctx) { *ctx = state->indir_desc; } } static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, u16 idx, bool used_wrap_counter) { bool avail, used; u16 flags; flags = le16_to_cpu(vq->packed.vring.desc[idx].flags); avail = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL)); used = !!(flags & (1 << VRING_PACKED_DESC_F_USED)); return avail == used && used == used_wrap_counter; } static bool more_used_packed(const struct vring_virtqueue *vq) { u16 last_used; u16 last_used_idx; bool used_wrap_counter; last_used_idx = READ_ONCE(vq->last_used_idx); last_used = packed_last_used(last_used_idx); used_wrap_counter = packed_used_wrap_counter(last_used_idx); return is_used_desc_packed(vq, last_used, used_wrap_counter); } static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, unsigned int *len, void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); u16 last_used, id, last_used_idx; bool used_wrap_counter; void *ret; START_USE(vq); if (unlikely(vq->broken)) { END_USE(vq); return NULL; } if (!more_used_packed(vq)) { pr_debug("No more buffers in queue\n"); END_USE(vq); return NULL; } /* Only get used elements after they have been exposed by host. */ virtio_rmb(vq->weak_barriers); last_used_idx = READ_ONCE(vq->last_used_idx); used_wrap_counter = packed_used_wrap_counter(last_used_idx); last_used = packed_last_used(last_used_idx); id = le16_to_cpu(vq->packed.vring.desc[last_used].id); *len = le32_to_cpu(vq->packed.vring.desc[last_used].len); if (unlikely(id >= vq->packed.vring.num)) { BAD_RING(vq, "id %u out of range\n", id); return NULL; } if (unlikely(!vq->packed.desc_state[id].data)) { BAD_RING(vq, "id %u is not a head!\n", id); return NULL; } /* detach_buf_packed clears data, so grab it now. */ ret = vq->packed.desc_state[id].data; detach_buf_packed(vq, id, ctx); last_used += vq->packed.desc_state[id].num; if (unlikely(last_used >= vq->packed.vring.num)) { last_used -= vq->packed.vring.num; used_wrap_counter ^= 1; } last_used = (last_used | (used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); WRITE_ONCE(vq->last_used_idx, last_used); /* * If we expect an interrupt for the next entry, tell host * by writing event index and flush out the write before * the read in the next get_buf call. */ if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC) virtio_store_mb(vq->weak_barriers, &vq->packed.vring.driver->off_wrap, cpu_to_le16(vq->last_used_idx)); LAST_ADD_TIME_INVALID(vq); END_USE(vq); return ret; } static void virtqueue_disable_cb_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->packed.event_flags_shadow != VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; /* * If device triggered an event already it won't trigger one again: * no need to disable. */ if (vq->event_triggered) return; vq->packed.vring.driver->flags = cpu_to_le16(vq->packed.event_flags_shadow); } } static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); START_USE(vq); /* * We optimistically turn back on interrupts, then check if there was * more to do. */ if (vq->event) { vq->packed.vring.driver->off_wrap = cpu_to_le16(vq->last_used_idx); /* * We need to update event offset and event wrap * counter first before updating event flags. */ virtio_wmb(vq->weak_barriers); } if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = vq->event ? VRING_PACKED_EVENT_FLAG_DESC : VRING_PACKED_EVENT_FLAG_ENABLE; vq->packed.vring.driver->flags = cpu_to_le16(vq->packed.event_flags_shadow); } END_USE(vq); return vq->last_used_idx; } static bool virtqueue_poll_packed(struct virtqueue *_vq, u16 off_wrap) { struct vring_virtqueue *vq = to_vvq(_vq); bool wrap_counter; u16 used_idx; wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); return is_used_desc_packed(vq, used_idx, wrap_counter); } static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 used_idx, wrap_counter, last_used_idx; u16 bufs; START_USE(vq); /* * We optimistically turn back on interrupts, then check if there was * more to do. */ if (vq->event) { /* TODO: tune this threshold */ bufs = (vq->packed.vring.num - vq->vq.num_free) * 3 / 4; last_used_idx = READ_ONCE(vq->last_used_idx); wrap_counter = packed_used_wrap_counter(last_used_idx); used_idx = packed_last_used(last_used_idx) + bufs; if (used_idx >= vq->packed.vring.num) { used_idx -= vq->packed.vring.num; wrap_counter ^= 1; } vq->packed.vring.driver->off_wrap = cpu_to_le16(used_idx | (wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); /* * We need to update event offset and event wrap * counter first before updating event flags. */ virtio_wmb(vq->weak_barriers); } if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = vq->event ? VRING_PACKED_EVENT_FLAG_DESC : VRING_PACKED_EVENT_FLAG_ENABLE; vq->packed.vring.driver->flags = cpu_to_le16(vq->packed.event_flags_shadow); } /* * We need to update event suppression structure first * before re-checking for more used buffers. */ virtio_mb(vq->weak_barriers); last_used_idx = READ_ONCE(vq->last_used_idx); wrap_counter = packed_used_wrap_counter(last_used_idx); used_idx = packed_last_used(last_used_idx); if (is_used_desc_packed(vq, used_idx, wrap_counter)) { END_USE(vq); return false; } END_USE(vq); return true; } static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i; void *buf; START_USE(vq); for (i = 0; i < vq->packed.vring.num; i++) { if (!vq->packed.desc_state[i].data) continue; /* detach_buf clears data, so grab it now. */ buf = vq->packed.desc_state[i].data; detach_buf_packed(vq, i, NULL); END_USE(vq); return buf; } /* That should have freed everything. */ BUG_ON(vq->vq.num_free != vq->packed.vring.num); END_USE(vq); return NULL; } static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) { struct vring_desc_extra *desc_extra; unsigned int i; desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra), GFP_KERNEL); if (!desc_extra) return NULL; memset(desc_extra, 0, num * sizeof(struct vring_desc_extra)); for (i = 0; i < num - 1; i++) desc_extra[i].next = i + 1; return desc_extra; } static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, struct device *dma_dev) { if (vring_packed->vring.desc) vring_free_queue(vdev, vring_packed->ring_size_in_bytes, vring_packed->vring.desc, vring_packed->ring_dma_addr, dma_dev); if (vring_packed->vring.driver) vring_free_queue(vdev, vring_packed->event_size_in_bytes, vring_packed->vring.driver, vring_packed->driver_event_dma_addr, dma_dev); if (vring_packed->vring.device) vring_free_queue(vdev, vring_packed->event_size_in_bytes, vring_packed->vring.device, vring_packed->device_event_dma_addr, dma_dev); kfree(vring_packed->desc_state); kfree(vring_packed->desc_extra); } static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, u32 num, struct device *dma_dev) { struct vring_packed_desc *ring; struct vring_packed_desc_event *driver, *device; dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; size_t ring_size_in_bytes, event_size_in_bytes; ring_size_in_bytes = num * sizeof(struct vring_packed_desc); ring = vring_alloc_queue(vdev, ring_size_in_bytes, &ring_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, dma_dev); if (!ring) goto err; vring_packed->vring.desc = ring; vring_packed->ring_dma_addr = ring_dma_addr; vring_packed->ring_size_in_bytes = ring_size_in_bytes; event_size_in_bytes = sizeof(struct vring_packed_desc_event); driver = vring_alloc_queue(vdev, event_size_in_bytes, &driver_event_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, dma_dev); if (!driver) goto err; vring_packed->vring.driver = driver; vring_packed->event_size_in_bytes = event_size_in_bytes; vring_packed->driver_event_dma_addr = driver_event_dma_addr; device = vring_alloc_queue(vdev, event_size_in_bytes, &device_event_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, dma_dev); if (!device) goto err; vring_packed->vring.device = device; vring_packed->device_event_dma_addr = device_event_dma_addr; vring_packed->vring.num = num; return 0; err: vring_free_packed(vring_packed, vdev, dma_dev); return -ENOMEM; } static int vring_alloc_state_extra_packed(struct vring_virtqueue_packed *vring_packed) { struct vring_desc_state_packed *state; struct vring_desc_extra *extra; u32 num = vring_packed->vring.num; state = kmalloc_array(num, sizeof(struct vring_desc_state_packed), GFP_KERNEL); if (!state) goto err_desc_state; memset(state, 0, num * sizeof(struct vring_desc_state_packed)); extra = vring_alloc_desc_extra(num); if (!extra) goto err_desc_extra; vring_packed->desc_state = state; vring_packed->desc_extra = extra; return 0; err_desc_extra: kfree(state); err_desc_state: return -ENOMEM; } static void virtqueue_vring_init_packed(struct vring_virtqueue_packed *vring_packed, bool callback) { vring_packed->next_avail_idx = 0; vring_packed->avail_wrap_counter = 1; vring_packed->event_flags_shadow = 0; vring_packed->avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL; /* No callback? Tell other side not to bother us. */ if (!callback) { vring_packed->event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; vring_packed->vring.driver->flags = cpu_to_le16(vring_packed->event_flags_shadow); } } static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq, struct vring_virtqueue_packed *vring_packed) { vq->packed = *vring_packed; /* Put everything in free lists. */ vq->free_head = 0; } static void virtqueue_reinit_packed(struct vring_virtqueue *vq) { memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes); memset(vq->packed.vring.driver, 0, vq->packed.event_size_in_bytes); /* we need to reset the desc.flags. For more, see is_used_desc_packed() */ memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes); virtqueue_init(vq, vq->packed.vring.num); virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback); } static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, bool weak_barriers, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { struct vring_virtqueue *vq; int err; vq = kmalloc(sizeof(*vq), GFP_KERNEL); if (!vq) return NULL; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; vq->vq.index = index; vq->vq.reset = false; vq->we_own_ring = false; vq->notify = notify; vq->weak_barriers = weak_barriers; #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION vq->broken = true; #else vq->broken = false; #endif vq->packed_ring = true; vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; err = vring_alloc_state_extra_packed(vring_packed); if (err) { kfree(vq); return NULL; } virtqueue_vring_init_packed(vring_packed, !!callback); virtqueue_init(vq, vring_packed->vring.num); virtqueue_vring_attach_packed(vq, vring_packed); spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); spin_unlock(&vdev->vqs_list_lock); return &vq->vq; } static struct virtqueue *vring_create_virtqueue_packed( unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { struct vring_virtqueue_packed vring_packed = {}; struct virtqueue *vq; if (vring_alloc_queue_packed(&vring_packed, vdev, num, dma_dev)) return NULL; vq = __vring_new_virtqueue_packed(index, &vring_packed, vdev, weak_barriers, context, notify, callback, name, dma_dev); if (!vq) { vring_free_packed(&vring_packed, vdev, dma_dev); return NULL; } to_vvq(vq)->we_own_ring = true; return vq; } static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) { struct vring_virtqueue_packed vring_packed = {}; struct vring_virtqueue *vq = to_vvq(_vq); struct virtio_device *vdev = _vq->vdev; int err; if (vring_alloc_queue_packed(&vring_packed, vdev, num, vring_dma_dev(vq))) goto err_ring; err = vring_alloc_state_extra_packed(&vring_packed); if (err) goto err_state_extra; vring_free(&vq->vq); virtqueue_vring_init_packed(&vring_packed, !!vq->vq.callback); virtqueue_init(vq, vring_packed.vring.num); virtqueue_vring_attach_packed(vq, &vring_packed); return 0; err_state_extra: vring_free_packed(&vring_packed, vdev, vring_dma_dev(vq)); err_ring: virtqueue_reinit_packed(vq); return -ENOMEM; } static int virtqueue_disable_and_recycle(struct virtqueue *_vq, void (*recycle)(struct virtqueue *vq, void *buf)) { struct vring_virtqueue *vq = to_vvq(_vq); struct virtio_device *vdev = vq->vq.vdev; void *buf; int err; if (!vq->we_own_ring) return -EPERM; if (!vdev->config->disable_vq_and_reset) return -ENOENT; if (!vdev->config->enable_vq_after_reset) return -ENOENT; err = vdev->config->disable_vq_and_reset(_vq); if (err) return err; while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL) recycle(_vq, buf); return 0; } static int virtqueue_enable_after_reset(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); struct virtio_device *vdev = vq->vq.vdev; if (vdev->config->enable_vq_after_reset(_vq)) return -EBUSY; return 0; } /* * Generic functions and exported symbols. */ static inline int virtqueue_add(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, void *ctx, bool premapped, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs, in_sgs, data, ctx, premapped, gfp) : virtqueue_add_split(_vq, sgs, total_sg, out_sgs, in_sgs, data, ctx, premapped, gfp); } /** * virtqueue_add_sgs - expose buffers to other end * @_vq: the struct virtqueue we're talking about. * @sgs: array of terminated scatterlists. * @out_sgs: the number of scatterlists readable by other side * @in_sgs: the number of scatterlists which are writable (after readable ones) * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_sgs(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int out_sgs, unsigned int in_sgs, void *data, gfp_t gfp) { unsigned int i, total_sg = 0; /* Count them first. */ for (i = 0; i < out_sgs + in_sgs; i++) { struct scatterlist *sg; for (sg = sgs[i]; sg; sg = sg_next(sg)) total_sg++; } return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, NULL, false, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); /** * virtqueue_add_outbuf - expose output buffers to other end * @vq: the struct virtqueue we're talking about. * @sg: scatterlist (must be well-formed and terminated!) * @num: the number of entries in @sg readable by other side * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_outbuf(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, false, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); /** * virtqueue_add_outbuf_premapped - expose output buffers to other end * @vq: the struct virtqueue we're talking about. * @sg: scatterlist (must be well-formed and terminated!) * @num: the number of entries in @sg readable by other side * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Return: * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_outbuf_premapped(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, true, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf_premapped); /** * virtqueue_add_inbuf - expose input buffers to other end * @vq: the struct virtqueue we're talking about. * @sg: scatterlist (must be well-formed and terminated!) * @num: the number of entries in @sg writable by other side * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_inbuf(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); /** * virtqueue_add_inbuf_ctx - expose input buffers to other end * @vq: the struct virtqueue we're talking about. * @sg: scatterlist (must be well-formed and terminated!) * @num: the number of entries in @sg writable by other side * @data: the token identifying the buffer. * @ctx: extra context for the token * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, void *ctx, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, false, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); /** * virtqueue_add_inbuf_premapped - expose input buffers to other end * @vq: the struct virtqueue we're talking about. * @sg: scatterlist (must be well-formed and terminated!) * @num: the number of entries in @sg writable by other side * @data: the token identifying the buffer. * @ctx: extra context for the token * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Return: * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_inbuf_premapped(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, void *ctx, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, true, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_premapped); /** * virtqueue_dma_dev - get the dma dev * @_vq: the struct virtqueue we're talking about. * * Returns the dma dev. That can been used for dma api. */ struct device *virtqueue_dma_dev(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->use_dma_api) return vring_dma_dev(vq); else return NULL; } EXPORT_SYMBOL_GPL(virtqueue_dma_dev); /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @_vq: the struct virtqueue * * Instead of virtqueue_kick(), you can do: * if (virtqueue_kick_prepare(vq)) * virtqueue_notify(vq); * * This is sometimes useful because the virtqueue_kick_prepare() needs * to be serialized, but the actual virtqueue_notify() call does not. */ bool virtqueue_kick_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? virtqueue_kick_prepare_packed(_vq) : virtqueue_kick_prepare_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); /** * virtqueue_notify - second half of split virtqueue_kick call. * @_vq: the struct virtqueue * * This does not need to be serialized. * * Returns false if host notify failed or queue is broken, otherwise true. */ bool virtqueue_notify(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (unlikely(vq->broken)) return false; /* Prod other side to tell it about changes. */ if (!vq->notify(_vq)) { vq->broken = true; return false; } return true; } EXPORT_SYMBOL_GPL(virtqueue_notify); /** * virtqueue_kick - update after add_buf * @vq: the struct virtqueue * * After one or more virtqueue_add_* calls, invoke this to kick * the other side. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). * * Returns false if kick failed, otherwise true. */ bool virtqueue_kick(struct virtqueue *vq) { if (virtqueue_kick_prepare(vq)) return virtqueue_notify(vq); return true; } EXPORT_SYMBOL_GPL(virtqueue_kick); /** * virtqueue_get_buf_ctx - get the next used buffer * @_vq: the struct virtqueue we're talking about. * @len: the length written into the buffer * @ctx: extra context for the token * * If the device wrote data into the buffer, @len will be set to the * amount written. This means you don't need to clear the buffer * beforehand to ensure there's no data leakage in the case of short * writes. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). * * Returns NULL if there are no used buffers, or the "data" token * handed to virtqueue_add_*(). */ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) : virtqueue_get_buf_ctx_split(_vq, len, ctx); } EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) { return virtqueue_get_buf_ctx(_vq, len, NULL); } EXPORT_SYMBOL_GPL(virtqueue_get_buf); /** * virtqueue_disable_cb - disable callbacks * @_vq: the struct virtqueue we're talking about. * * Note that this is not necessarily synchronous, hence unreliable and only * useful as an optimization. * * Unlike other operations, this need not be serialized. */ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->packed_ring) virtqueue_disable_cb_packed(_vq); else virtqueue_disable_cb_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_disable_cb); /** * virtqueue_enable_cb_prepare - restart callbacks after disable_cb * @_vq: the struct virtqueue we're talking about. * * This re-enables callbacks; it returns current queue state * in an opaque unsigned value. This value should be later tested by * virtqueue_poll, to detect a possible race between the driver checking for * more work, and enabling callbacks. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). */ unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->event_triggered) vq->event_triggered = false; return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) : virtqueue_enable_cb_prepare_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare); /** * virtqueue_poll - query pending used buffers * @_vq: the struct virtqueue we're talking about. * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare). * * Returns "true" if there are pending used buffers in the queue. * * This does not need to be serialized. */ bool virtqueue_poll(struct virtqueue *_vq, unsigned int last_used_idx) { struct vring_virtqueue *vq = to_vvq(_vq); if (unlikely(vq->broken)) return false; virtio_mb(vq->weak_barriers); return vq->packed_ring ? virtqueue_poll_packed(_vq, last_used_idx) : virtqueue_poll_split(_vq, last_used_idx); } EXPORT_SYMBOL_GPL(virtqueue_poll); /** * virtqueue_enable_cb - restart callbacks after disable_cb. * @_vq: the struct virtqueue we're talking about. * * This re-enables callbacks; it returns "false" if there are pending * buffers in the queue, to detect a possible race between the driver * checking for more work, and enabling callbacks. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). */ bool virtqueue_enable_cb(struct virtqueue *_vq) { unsigned int last_used_idx = virtqueue_enable_cb_prepare(_vq); return !virtqueue_poll(_vq, last_used_idx); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb); /** * virtqueue_enable_cb_delayed - restart callbacks after disable_cb. * @_vq: the struct virtqueue we're talking about. * * This re-enables callbacks but hints to the other side to delay * interrupts until most of the available buffers have been processed; * it returns "false" if there are many pending buffers in the queue, * to detect a possible race between the driver checking for more work, * and enabling callbacks. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). */ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->event_triggered) vq->event_triggered = false; return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : virtqueue_enable_cb_delayed_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); /** * virtqueue_detach_unused_buf - detach first unused buffer * @_vq: the struct virtqueue we're talking about. * * Returns NULL or the "data" token handed to virtqueue_add_*(). * This is not valid on an active queue; it is useful for device * shutdown or the reset queue. */ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? virtqueue_detach_unused_buf_packed(_vq) : virtqueue_detach_unused_buf_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf); static inline bool more_used(const struct vring_virtqueue *vq) { return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq); } /** * vring_interrupt - notify a virtqueue on an interrupt * @irq: the IRQ number (ignored) * @_vq: the struct virtqueue to notify * * Calls the callback function of @_vq to process the virtqueue * notification. */ irqreturn_t vring_interrupt(int irq, void *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (!more_used(vq)) { pr_debug("virtqueue interrupt with no work for %p\n", vq); return IRQ_NONE; } if (unlikely(vq->broken)) { #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION dev_warn_once(&vq->vq.vdev->dev, "virtio vring IRQ raised before DRIVER_OK"); return IRQ_NONE; #else return IRQ_HANDLED; #endif } /* Just a hint for performance: so it's ok that this can be racy! */ if (vq->event) data_race(vq->event_triggered = true); pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); if (vq->vq.callback) vq->vq.callback(&vq->vq); return IRQ_HANDLED; } EXPORT_SYMBOL_GPL(vring_interrupt); struct virtqueue *vring_create_virtqueue( unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) { if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return vring_create_virtqueue_packed(index, num, vring_align, vdev, weak_barriers, may_reduce_num, context, notify, callback, name, vdev->dev.parent); return vring_create_virtqueue_split(index, num, vring_align, vdev, weak_barriers, may_reduce_num, context, notify, callback, name, vdev->dev.parent); } EXPORT_SYMBOL_GPL(vring_create_virtqueue); struct virtqueue *vring_create_virtqueue_dma( unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return vring_create_virtqueue_packed(index, num, vring_align, vdev, weak_barriers, may_reduce_num, context, notify, callback, name, dma_dev); return vring_create_virtqueue_split(index, num, vring_align, vdev, weak_barriers, may_reduce_num, context, notify, callback, name, dma_dev); } EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); /** * virtqueue_resize - resize the vring of vq * @_vq: the struct virtqueue we're talking about. * @num: new ring num * @recycle: callback to recycle unused buffers * @recycle_done: callback to be invoked when recycle for all unused buffers done * * When it is really necessary to create a new vring, it will set the current vq * into the reset state. Then call the passed callback to recycle the buffer * that is no longer used. Only after the new vring is successfully created, the * old vring will be released. * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error. * 0: success. * -ENOMEM: Failed to allocate a new ring, fall back to the original ring size. * vq can still work normally * -EBUSY: Failed to sync with device, vq may not work properly * -ENOENT: Transport or device not supported * -E2BIG/-EINVAL: num error * -EPERM: Operation not permitted * */ int virtqueue_resize(struct virtqueue *_vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf), void (*recycle_done)(struct virtqueue *vq)) { struct vring_virtqueue *vq = to_vvq(_vq); int err; if (num > vq->vq.num_max) return -E2BIG; if (!num) return -EINVAL; if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num) return 0; err = virtqueue_disable_and_recycle(_vq, recycle); if (err) return err; if (recycle_done) recycle_done(_vq); if (vq->packed_ring) err = virtqueue_resize_packed(_vq, num); else err = virtqueue_resize_split(_vq, num); return virtqueue_enable_after_reset(_vq); } EXPORT_SYMBOL_GPL(virtqueue_resize); /** * virtqueue_reset - detach and recycle all unused buffers * @_vq: the struct virtqueue we're talking about. * @recycle: callback to recycle unused buffers * @recycle_done: callback to be invoked when recycle for all unused buffers done * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error. * 0: success. * -EBUSY: Failed to sync with device, vq may not work properly * -ENOENT: Transport or device not supported * -EPERM: Operation not permitted */ int virtqueue_reset(struct virtqueue *_vq, void (*recycle)(struct virtqueue *vq, void *buf), void (*recycle_done)(struct virtqueue *vq)) { struct vring_virtqueue *vq = to_vvq(_vq); int err; err = virtqueue_disable_and_recycle(_vq, recycle); if (err) return err; if (recycle_done) recycle_done(_vq); if (vq->packed_ring) virtqueue_reinit_packed(vq); else virtqueue_reinit_split(vq); return virtqueue_enable_after_reset(_vq); } EXPORT_SYMBOL_GPL(virtqueue_reset); struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool context, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name) { struct vring_virtqueue_split vring_split = {}; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) { struct vring_virtqueue_packed vring_packed = {}; vring_packed.vring.num = num; vring_packed.vring.desc = pages; return __vring_new_virtqueue_packed(index, &vring_packed, vdev, weak_barriers, context, notify, callback, name, vdev->dev.parent); } vring_init(&vring_split.vring, num, pages, vring_align); return __vring_new_virtqueue_split(index, &vring_split, vdev, weak_barriers, context, notify, callback, name, vdev->dev.parent); } EXPORT_SYMBOL_GPL(vring_new_virtqueue); static void vring_free(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->we_own_ring) { if (vq->packed_ring) { vring_free_queue(vq->vq.vdev, vq->packed.ring_size_in_bytes, vq->packed.vring.desc, vq->packed.ring_dma_addr, vring_dma_dev(vq)); vring_free_queue(vq->vq.vdev, vq->packed.event_size_in_bytes, vq->packed.vring.driver, vq->packed.driver_event_dma_addr, vring_dma_dev(vq)); vring_free_queue(vq->vq.vdev, vq->packed.event_size_in_bytes, vq->packed.vring.device, vq->packed.device_event_dma_addr, vring_dma_dev(vq)); kfree(vq->packed.desc_state); kfree(vq->packed.desc_extra); } else { vring_free_queue(vq->vq.vdev, vq->split.queue_size_in_bytes, vq->split.vring.desc, vq->split.queue_dma_addr, vring_dma_dev(vq)); } } if (!vq->packed_ring) { kfree(vq->split.desc_state); kfree(vq->split.desc_extra); } } void vring_del_virtqueue(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); spin_lock(&vq->vq.vdev->vqs_list_lock); list_del(&_vq->list); spin_unlock(&vq->vq.vdev->vqs_list_lock); vring_free(_vq); kfree(vq); } EXPORT_SYMBOL_GPL(vring_del_virtqueue); u32 vring_notification_data(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 next; if (vq->packed_ring) next = (vq->packed.next_avail_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) | vq->packed.avail_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR; else next = vq->split.avail_idx_shadow; return next << 16 | _vq->index; } EXPORT_SYMBOL_GPL(vring_notification_data); /* Manipulates transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev) { unsigned int i; for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) { switch (i) { case VIRTIO_RING_F_INDIRECT_DESC: break; case VIRTIO_RING_F_EVENT_IDX: break; case VIRTIO_F_VERSION_1: break; case VIRTIO_F_ACCESS_PLATFORM: break; case VIRTIO_F_RING_PACKED: break; case VIRTIO_F_ORDER_PLATFORM: break; case VIRTIO_F_NOTIFICATION_DATA: break; default: /* We don't understand this bit. */ __virtio_clear_bit(vdev, i); } } } EXPORT_SYMBOL_GPL(vring_transport_features); /** * virtqueue_get_vring_size - return the size of the virtqueue's vring * @_vq: the struct virtqueue containing the vring of interest. * * Returns the size of the vring. This is mainly used for boasting to * userspace. Unlike other operations, this need not be serialized. */ unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num; } EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); /* * This function should only be called by the core, not directly by the driver. */ void __virtqueue_break(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, true); } EXPORT_SYMBOL_GPL(__virtqueue_break); /* * This function should only be called by the core, not directly by the driver. */ void __virtqueue_unbreak(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, false); } EXPORT_SYMBOL_GPL(__virtqueue_unbreak); bool virtqueue_is_broken(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); return READ_ONCE(vq->broken); } EXPORT_SYMBOL_GPL(virtqueue_is_broken); /* * This should prevent the device from being used, allowing drivers to * recover. You may need to grab appropriate locks to flush. */ void virtio_break_device(struct virtio_device *dev) { struct virtqueue *_vq; spin_lock(&dev->vqs_list_lock); list_for_each_entry(_vq, &dev->vqs, list) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, true); } spin_unlock(&dev->vqs_list_lock); } EXPORT_SYMBOL_GPL(virtio_break_device); /* * This should allow the device to be used by the driver. You may * need to grab appropriate locks to flush the write to * vq->broken. This should only be used in some specific case e.g * (probing and restoring). This function should only be called by the * core, not directly by the driver. */ void __virtio_unbreak_device(struct virtio_device *dev) { struct virtqueue *_vq; spin_lock(&dev->vqs_list_lock); list_for_each_entry(_vq, &dev->vqs, list) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, false); } spin_unlock(&dev->vqs_list_lock); } EXPORT_SYMBOL_GPL(__virtio_unbreak_device); dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); if (vq->packed_ring) return vq->packed.ring_dma_addr; return vq->split.queue_dma_addr; } EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr); dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); if (vq->packed_ring) return vq->packed.driver_event_dma_addr; return vq->split.queue_dma_addr + ((char *)vq->split.vring.avail - (char *)vq->split.vring.desc); } EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr); dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); if (vq->packed_ring) return vq->packed.device_event_dma_addr; return vq->split.queue_dma_addr + ((char *)vq->split.vring.used - (char *)vq->split.vring.desc); } EXPORT_SYMBOL_GPL(virtqueue_get_used_addr); /* Only available for split ring */ const struct vring *virtqueue_get_vring(const struct virtqueue *vq) { return &to_vvq(vq)->split.vring; } EXPORT_SYMBOL_GPL(virtqueue_get_vring); /** * virtqueue_dma_map_single_attrs - map DMA for _vq * @_vq: the struct virtqueue we're talking about. * @ptr: the pointer of the buffer to do dma * @size: the size of the buffer to do dma * @dir: DMA direction * @attrs: DMA Attrs * * The caller calls this to do dma mapping in advance. The DMA address can be * passed to this _vq when it is in pre-mapped mode. * * return DMA address. Caller should check that by virtqueue_dma_mapping_error(). */ dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) { kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir); return (dma_addr_t)virt_to_phys(ptr); } return dma_map_single_attrs(vring_dma_dev(vq), ptr, size, dir, attrs); } EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs); /** * virtqueue_dma_unmap_single_attrs - unmap DMA for _vq * @_vq: the struct virtqueue we're talking about. * @addr: the dma address to unmap * @size: the size of the buffer * @dir: DMA direction * @attrs: DMA Attrs * * Unmap the address that is mapped by the virtqueue_dma_map_* APIs. * */ void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return; dma_unmap_single_attrs(vring_dma_dev(vq), addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs); /** * virtqueue_dma_mapping_error - check dma address * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * * Returns 0 means dma valid. Other means invalid dma address. */ int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr) { struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return 0; return dma_mapping_error(vring_dma_dev(vq), addr); } EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error); /** * virtqueue_dma_need_sync - check a dma address needs sync * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * * Check if the dma address mapped by the virtqueue_dma_map_* APIs needs to be * synchronized * * return bool */ bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr) { struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return false; return dma_need_sync(vring_dma_dev(vq), addr); } EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync); /** * virtqueue_dma_sync_single_range_for_cpu - dma sync for cpu * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * @offset: DMA address offset * @size: buf size for sync * @dir: DMA direction * * Before calling this function, use virtqueue_dma_need_sync() to confirm that * the DMA address really needs to be synchronized * */ void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); if (!vq->use_dma_api) return; dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); } EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu); /** * virtqueue_dma_sync_single_range_for_device - dma sync for device * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * @offset: DMA address offset * @size: buf size for sync * @dir: DMA direction * * Before calling this function, use virtqueue_dma_need_sync() to confirm that * the DMA address really needs to be synchronized */ void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); if (!vq->use_dma_api) return; dma_sync_single_range_for_device(dev, addr, offset, size, dir); } EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_device); MODULE_DESCRIPTION("Virtio ring implementation"); MODULE_LICENSE("GPL"); |
1 10 1 9 9 9 9 9 4 9 3 2 1 1 1 5 4 4 4 4 4 1 1 2 4 2 7 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> #include <net/inet_sock.h> #include <net/raw.h> #include <net/rawv6.h> #ifdef pr_fmt # undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt static struct raw_hashinfo * raw_get_hashinfo(const struct inet_diag_req_v2 *r) { if (r->sdiag_family == AF_INET) { return &raw_v4_hashinfo; #if IS_ENABLED(CONFIG_IPV6) } else if (r->sdiag_family == AF_INET6) { return &raw_v6_hashinfo; #endif } else { return ERR_PTR(-EINVAL); } } /* * Due to requirement of not breaking user API we can't simply * rename @pad field in inet_diag_req_v2 structure, instead * use helper to figure it out. */ static bool raw_lookup(struct net *net, const struct sock *sk, const struct inet_diag_req_v2 *req) { struct inet_diag_req_raw *r = (void *)req; if (r->sdiag_family == AF_INET) return raw_v4_match(net, sk, r->sdiag_raw_protocol, r->id.idiag_dst[0], r->id.idiag_src[0], r->id.idiag_if, 0); #if IS_ENABLED(CONFIG_IPV6) else return raw_v6_match(net, sk, r->sdiag_raw_protocol, (const struct in6_addr *)r->id.idiag_src, (const struct in6_addr *)r->id.idiag_dst, r->id.idiag_if, 0); #endif return false; } static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct hlist_head *hlist; struct sock *sk; int slot; if (IS_ERR(hashinfo)) return ERR_CAST(hashinfo); rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill * diag message to be reported, so * caller should call sock_put then. */ if (refcount_inc_not_zero(&sk->sk_refcnt)) goto out_unlock; } } } sk = ERR_PTR(-ENOENT); out_unlock: rcu_read_unlock(); return sk; } static int raw_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct sk_buff *in_skb = cb->skb; struct sk_buff *rep; struct sock *sk; struct net *net; int err; net = sock_net(in_skb->sk); sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) { sock_put(sk); return -ENOMEM; } err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); sock_put(sk); if (err < 0) { kfree_skb(rep); return err; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); return err; } static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); } static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; struct nlattr *bc; if (IS_ERR(hashinfo)) return; cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; rcu_read_lock(); for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) { num = 0; hlist = &hashinfo->ht[slot]; sk_for_each_rcu(sk, hlist) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) goto out_unlock; next: num++; } } out_unlock: rcu_read_unlock(); cb->args[0] = slot; cb->args[1] = num; } static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int raw_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *r) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; sk = raw_sock_get(net, r); if (IS_ERR(sk)) return PTR_ERR(sk); err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } #endif static const struct inet_diag_handler raw_diag_handler = { .owner = THIS_MODULE, .dump = raw_diag_dump, .dump_one = raw_diag_dump_one, .idiag_get_info = raw_diag_get_info, .idiag_type = IPPROTO_RAW, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = raw_diag_destroy, #endif }; static void __always_unused __check_inet_diag_req_raw(void) { /* * Make sure the two structures are identical, * except the @pad field. */ #define __offset_mismatch(m1, m2) \ (offsetof(struct inet_diag_req_v2, m1) != \ offsetof(struct inet_diag_req_raw, m2)) BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) != sizeof(struct inet_diag_req_raw)); BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family)); BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext)); BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol)); BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states)); BUILD_BUG_ON(__offset_mismatch(id, id)); #undef __offset_mismatch } static int __init raw_diag_init(void) { return inet_diag_register(&raw_diag_handler); } static void __exit raw_diag_exit(void) { inet_diag_unregister(&raw_diag_handler); } module_init(raw_diag_init); module_exit(raw_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */); |
5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #ifndef _NET_BATMAN_ADV_MAIN_H_ #define _NET_BATMAN_ADV_MAIN_H_ #define BATADV_DRIVER_AUTHOR "Marek Lindner <mareklindner@neomailbox.ch>, " \ "Simon Wunderlich <sw@simonwunderlich.de>" #define BATADV_DRIVER_DESC "B.A.T.M.A.N. advanced" #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION #define BATADV_SOURCE_VERSION "2024.3" #endif /* B.A.T.M.A.N. parameters */ #define BATADV_TQ_MAX_VALUE 255 #define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF #define BATADV_JITTER 20 /* Time To Live of broadcast messages */ #define BATADV_TTL 50 /* maximum sequence number age of broadcast messages */ #define BATADV_BCAST_MAX_AGE 64 /* purge originators after time in seconds if no valid packet comes in * -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE */ #define BATADV_PURGE_TIMEOUT 200000 /* 200 seconds */ #define BATADV_TT_LOCAL_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_CLIENT_ROAM_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_CLIENT_TEMP_TIMEOUT 600000 /* in milliseconds */ #define BATADV_TT_WORK_PERIOD 5000 /* 5 seconds */ #define BATADV_ORIG_WORK_PERIOD 1000 /* 1 second */ #define BATADV_MCAST_WORK_PERIOD 500 /* 0.5 seconds */ #define BATADV_DAT_ENTRY_TIMEOUT (5 * 60000) /* 5 mins in milliseconds */ /* sliding packet range of received originator messages in sequence numbers * (should be a multiple of our word size) */ #define BATADV_TQ_LOCAL_WINDOW_SIZE 64 /* milliseconds we have to keep pending tt_req */ #define BATADV_TT_REQUEST_TIMEOUT 3000 #define BATADV_TQ_GLOBAL_WINDOW_SIZE 5 #define BATADV_TQ_LOCAL_BIDRECT_SEND_MINIMUM 1 #define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1 #define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1 /* B.A.T.M.A.N. V */ #define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */ #define BATADV_ELP_PROBES_PER_NODE 2 #define BATADV_ELP_MIN_PROBE_SIZE 200 /* bytes */ #define BATADV_ELP_PROBE_MAX_TX_DIFF 100 /* milliseconds */ #define BATADV_ELP_MAX_AGE 64 #define BATADV_OGM_MAX_ORIGDIFF 5 #define BATADV_OGM_MAX_AGE 64 /* number of OGMs sent with the last tt diff */ #define BATADV_TT_OGM_APPEND_MAX 3 /* Time in which a client can roam at most ROAMING_MAX_COUNT times in * milliseconds */ #define BATADV_ROAMING_MAX_TIME 20000 #define BATADV_ROAMING_MAX_COUNT 5 #define BATADV_NO_FLAGS 0 #define BATADV_NULL_IFINDEX 0 /* dummy ifindex used to avoid iface checks */ #define BATADV_NO_MARK 0 /* default interface for multi interface operation. The default interface is * used for communication which originated locally (i.e. is not forwarded) * or where special forwarding is not desired/necessary. */ #define BATADV_IF_DEFAULT ((struct batadv_hard_iface *)NULL) #define BATADV_NUM_WORDS BITS_TO_LONGS(BATADV_TQ_LOCAL_WINDOW_SIZE) #define BATADV_LOG_BUF_LEN 8192 /* has to be a power of 2 */ /* number of packets to send for broadcasts on different interface types */ #define BATADV_NUM_BCASTS_DEFAULT 1 #define BATADV_NUM_BCASTS_WIRELESS 3 /* length of the single packet used by the TP meter */ #define BATADV_TP_PACKET_LEN ETH_DATA_LEN /* msecs after which an ARP_REQUEST is sent in broadcast as fallback */ #define ARP_REQ_DELAY 250 /* numbers of originator to contact for any PUT/GET DHT operation */ #define BATADV_DAT_CANDIDATES_NUM 3 /* BATADV_TQ_SIMILARITY_THRESHOLD - TQ points that a secondary metric can differ * at most from the primary one in order to be still considered acceptable */ #define BATADV_TQ_SIMILARITY_THRESHOLD 50 /* should not be bigger than 512 bytes or change the size of * forw_packet->direct_link_flags */ #define BATADV_MAX_AGGREGATION_BYTES 512 #define BATADV_MAX_AGGREGATION_MS 100 #define BATADV_BLA_PERIOD_LENGTH 10000 /* 10 seconds */ #define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6) #define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10) #define BATADV_BLA_WAIT_PERIODS 3 #define BATADV_BLA_LOOPDETECT_PERIODS 6 #define BATADV_BLA_LOOPDETECT_TIMEOUT 3000 /* 3 seconds */ #define BATADV_DUPLIST_SIZE 16 #define BATADV_DUPLIST_TIMEOUT 500 /* 500 ms */ /* don't reset again within 30 seconds */ #define BATADV_RESET_PROTECTION_MS 30000 #define BATADV_EXPECTED_SEQNO_RANGE 65536 #define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ /** * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions */ #define BATADV_TP_MAX_NUM 5 /** * enum batadv_mesh_state - State of a soft interface */ enum batadv_mesh_state { /** @BATADV_MESH_INACTIVE: soft interface is not yet running */ BATADV_MESH_INACTIVE, /** @BATADV_MESH_ACTIVE: interface is up and running */ BATADV_MESH_ACTIVE, /** @BATADV_MESH_DEACTIVATING: interface is getting shut down */ BATADV_MESH_DEACTIVATING, }; #define BATADV_BCAST_QUEUE_LEN 256 #define BATADV_BATMAN_QUEUE_LEN 256 /** * enum batadv_uev_action - action type of uevent */ enum batadv_uev_action { /** @BATADV_UEV_ADD: gateway was selected (after none was selected) */ BATADV_UEV_ADD = 0, /** * @BATADV_UEV_DEL: selected gateway was removed and none is selected * anymore */ BATADV_UEV_DEL, /** * @BATADV_UEV_CHANGE: a different gateway was selected as based gateway */ BATADV_UEV_CHANGE, /** * @BATADV_UEV_LOOPDETECT: loop was detected which cannot be handled by * bridge loop avoidance */ BATADV_UEV_LOOPDETECT, }; /** * enum batadv_uev_type - Type of uevent */ enum batadv_uev_type { /** @BATADV_UEV_GW: selected gateway was modified */ BATADV_UEV_GW = 0, /** @BATADV_UEV_BLA: bridge loop avoidance event */ BATADV_UEV_BLA, }; #define BATADV_GW_THRESHOLD 50 /* Number of fragment chains for each orig_node */ #define BATADV_FRAG_BUFFER_COUNT 8 /* Maximum number of fragments for one packet */ #define BATADV_FRAG_MAX_FRAGMENTS 16 /* Maxumim size of each fragment */ #define BATADV_FRAG_MAX_FRAG_SIZE 1280 /* Time to keep fragments while waiting for rest of the fragments */ #define BATADV_FRAG_TIMEOUT 10000 #define BATADV_DAT_CANDIDATE_NOT_FOUND 0 #define BATADV_DAT_CANDIDATE_ORIG 1 /* Debug Messages */ #ifdef pr_fmt #undef pr_fmt #endif /* Append 'batman-adv: ' before kernel messages */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* Kernel headers */ #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/percpu.h> #include <linux/skbuff.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "types.h" #include "main.h" /** * batadv_print_vid() - return printable version of vid information * @vid: the VLAN identifier * * Return: -1 when no VLAN is used, VLAN id otherwise */ static inline int batadv_print_vid(unsigned short vid) { if (vid & BATADV_VLAN_HAS_TAG) return (int)(vid & VLAN_VID_MASK); else return -1; } extern struct list_head batadv_hardif_list; extern unsigned int batadv_hardif_generation; extern unsigned char batadv_broadcast_addr[]; extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr); int batadv_max_header_len(void); void batadv_skb_set_priority(struct sk_buff *skb, int offset); int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev); int batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); void batadv_recv_handler_unregister(u8 packet_type); __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); /** * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses * @data1: Pointer to a six-byte array containing the Ethernet address * @data2: Pointer other six-byte array containing the Ethernet address * * note: can't use ether_addr_equal() as it requires aligned memory * * Return: true if they are the same ethernet addr */ static inline bool batadv_compare_eth(const void *data1, const void *data2) { return ether_addr_equal_unaligned(data1, data2); } /** * batadv_has_timed_out() - compares current time (jiffies) and timestamp + * timeout * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) * * Return: true if current time is after timestamp + timeout */ static inline bool batadv_has_timed_out(unsigned long timestamp, unsigned int timeout) { return time_is_before_jiffies(timestamp + msecs_to_jiffies(timeout)); } /** * batadv_atomic_dec_not_zero() - Decrease unless the number is 0 * @v: pointer of type atomic_t * * Return: non-zero if v was not 0, and zero otherwise. */ #define batadv_atomic_dec_not_zero(v) atomic_add_unless((v), -1, 0) /** * batadv_smallest_signed_int() - Returns the smallest signed integer in two's * complement with the sizeof x * @x: type of integer * * Return: smallest signed integer of type */ #define batadv_smallest_signed_int(x) (1u << (7u + 8u * (sizeof(x) - 1u))) /** * batadv_seq_before() - Checks if a sequence number x is a predecessor of y * @x: potential predecessor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a predecessor * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: * * * when adding nothing - it is neither a predecessor nor a successor * * before adding more than 127 to the starting value - it is a predecessor, * * when adding 128 - it is neither a predecessor nor a successor, * * after adding more than 127 to the starting value - it is a successor * * Return: true when x is a predecessor of y, false otherwise */ #define batadv_seq_before(x, y) ({ \ typeof(x)_d1 = (x); \ typeof(y)_d2 = (y); \ typeof(x)_dummy = (_d1 - _d2); \ (void)(&_d1 == &_d2); \ _dummy > batadv_smallest_signed_int(_dummy); \ }) /** * batadv_seq_after() - Checks if a sequence number x is a successor of y * @x: potential successor of @y * @y: value to compare @x against * * It handles overflows/underflows and can correctly check for a successor * unless the variable sequence number has grown by more than * 2**(bitwidth(x)-1)-1. * * This means that for a u8 with the maximum value 255, it would think: * * * when adding nothing - it is neither a predecessor nor a successor * * before adding more than 127 to the starting value - it is a predecessor, * * when adding 128 - it is neither a predecessor nor a successor, * * after adding more than 127 to the starting value - it is a successor * * Return: true when x is a successor of y, false otherwise */ #define batadv_seq_after(x, y) batadv_seq_before(y, x) /** * batadv_add_counter() - Add to per cpu statistics counter of soft interface * @bat_priv: the bat priv with all the soft interface information * @idx: counter index which should be modified * @count: value to increase counter by * * Stop preemption on local cpu while incrementing the counter */ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, size_t count) { this_cpu_add(bat_priv->bat_counters[idx], count); } /** * batadv_inc_counter() - Increase per cpu statistics counter of soft interface * @b: the bat priv with all the soft interface information * @i: counter index which should be modified */ #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) /** * BATADV_SKB_CB() - Get batadv_skb_cb from skb control buffer * @__skb: skb holding the control buffer * * The members of the control buffer are defined in struct batadv_skb_cb in * types.h. The macro is inspired by the similar macro TCP_SKB_CB() in tcp.h. * * Return: pointer to the batadv_skb_cb of the skb */ #define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len); bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid); int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, enum batadv_uev_action action, const char *data); #endif /* _NET_BATMAN_ADV_MAIN_H_ */ |
6 24 2 9 24 3 1 21 22 4 18 19 21 18 1 1 7 4 6 2 3 1 1 2 5 3 1 7 5 2 24 9 4 4 3 3 10 24 24 13 30 3 2 4 25 8 1 24 8 26 2 24 3 24 6 3 1 25 2 2 23 5 24 23 1 2 1 2 2 1 12 28 1 29 30 1 24 5 36 3 32 36 33 32 32 7 24 1 4 3 6 2 5 1 6 4 31 5 2 31 1 32 32 6 3 29 31 5 30 5 2 24 5 1 4 3 2 3 3 2 3 3 2 1 2 1 2 1 1 2 2 1 2 1 2 1 2 1 2 1 2 1 2 1 1 1 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2002-2013 Jozsef Kadlecsik <kadlec@netfilter.org> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <linux/unaligned.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ static const char *const tcp_conntrack_names[] = { "NONE", "SYN_SENT", "SYN_RECV", "ESTABLISHED", "FIN_WAIT", "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT", "CLOSE", "SYN_SENT2", }; enum nf_ct_tcp_action { NFCT_TCP_IGNORE, NFCT_TCP_INVALID, NFCT_TCP_ACCEPT, }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { [TCP_CONNTRACK_SYN_SENT] = 2 MINS, [TCP_CONNTRACK_SYN_RECV] = 60 SECS, [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, [TCP_CONNTRACK_LAST_ACK] = 30 SECS, [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE] = 10 SECS, [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ [TCP_CONNTRACK_RETRANS] = 5 MINS, [TCP_CONNTRACK_UNACK] = 5 MINS, }; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT #define sSR TCP_CONNTRACK_SYN_RECV #define sES TCP_CONNTRACK_ESTABLISHED #define sFW TCP_CONNTRACK_FIN_WAIT #define sCW TCP_CONNTRACK_CLOSE_WAIT #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE #define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE /* What TCP flags are set from RST/SYN/FIN/ACK. */ enum tcp_bit_set { TCP_SYN_SET, TCP_SYNACK_SET, TCP_FIN_SET, TCP_ACK_SET, TCP_RST_SET, TCP_NONE_SET, }; /* * The TCP state transition table needs a few words... * * We are the man in the middle. All the packets go through us * but might get lost in transit to the destination. * It is assumed that the destinations can't receive segments * we haven't seen. * * The checked segment is in window, but our windows are *not* * equivalent with the ones of the sender/receiver. We always * try to guess the state of the current sender. * * The meaning of the states are: * * NONE: initial state * SYN_SENT: SYN-only packet seen * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen * CLOSE_WAIT: ACK seen (after FIN) * LAST_ACK: FIN seen (after FIN) * TIME_WAIT: last ACK seen * CLOSE: closed connection (RST) * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid * and the receiver may send back a connection * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): * if we regard them as truly invalid packets */ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN * sS2 -> sS2 Late retransmitted SYN * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. * Or we are not in sync and hold a dead connection. * sFW -> sIG * sCW -> sIG * sLA -> sIG * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open * sES -> sIV Invalid SYN/ACK packets sent by the client * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for * the last ACK. * Migth be a retransmitted FIN as well... * sCW -> sLA * sLA -> sLA Retransmitted FIN. Remain in the same state. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. * sS2 -> sIV * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 }, /* * sNO -> sIV Never reached. * sSS -> sS2 Simultaneous open * sS2 -> sS2 Retransmitted simultaneous SYN * sSR -> sIV Invalid SYN packets sent by the server * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sSS Reopened connection, but server may have switched role * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. * sS2 -> sSR Simultaneous open * sSR -> sIG Retransmitted SYN/ACK, ignore it. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN * sCW -> sIG * sLA -> sIG * sTW -> sIG * sCL -> sIG */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. * sCW -> sLA * sLA -> sLA Retransmitted FIN. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) return; seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } #endif static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); else if (tcph->fin) return TCP_FIN_SET; else if (tcph->ack) return TCP_ACK_SET; else return TCP_NONE_SET; } /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering in IP Filter' by Guido van Rooij. http://www.sane.nl/events/sane2000/papers.html http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ The boundaries and the conditions are changed according to RFC793: the packet must intersect the window (i.e. segments may be after the right or before the left edge) and thus receivers may ACK segments after the right edge of the window. td_maxend = max(sack + max(win,1)) seen in reply packets td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets td_maxwin += seq + len - sender.td_maxend if seq + len > sender.td_maxend td_end = max(seq + len) seen in sent packets I. Upper bound for valid data: seq <= sender.td_maxend II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin III. Upper bound for valid (s)ack: sack <= receiver.td_end IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW where sack is the highest right edge of sack block found in the packet or ack in the case of packet without SACK option. The upper bound limit for a valid (s)ack is not ignored - we doesn't have to deal with fragments. */ static inline __u32 segment_seq_plus_len(__u32 seq, size_t len, unsigned int dataoff, const struct tcphdr *tcph) { /* XXX Should I use payload length field in IP/IPv6 header ? * - YK */ return (seq + len - dataoff - tcph->doff*4 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); } /* Fixme: what about big packets? */ #define MAXACKWINCONST 66000 #define MAXACKWINDOW(sender) \ ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ : MAXACKWINCONST) /* * Simplified tcp_parse_options routine from tcp_input.c */ static void tcp_options(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, struct ip_ct_tcp_state *state) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); if (!ptr) return; state->td_scale = 0; state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; while (length > 0) { int opcode=*ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_SACK_PERM && opsize == TCPOLEN_SACK_PERM) state->flags |= IP_CT_TCP_FLAG_SACK_PERM; else if (opcode == TCPOPT_WINDOW && opsize == TCPOLEN_WINDOW) { state->td_scale = *(u_int8_t *)ptr; if (state->td_scale > TCP_MAX_WSCALE) state->td_scale = TCP_MAX_WSCALE; state->flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; } ptr += opsize - 2; length -= opsize; } } } static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, __u32 *sack) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); __u32 tmp; if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); if (!ptr) return; /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) return; while (length > 0) { int opcode = *ptr++; int opsize, i; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_SACK && opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK)) { for (i = 0; i < (opsize - TCPOLEN_SACK_BASE); i += TCPOLEN_SACK_PERBLOCK) { tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); if (after(tmp, *sack)) *sack = tmp; } return; } ptr += opsize - 2; length -= opsize; } } } static void tcp_init_sender(struct ip_ct_tcp_state *sender, struct ip_ct_tcp_state *receiver, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, u32 end, u32 win, enum ip_conntrack_dir dir) { /* SYN-ACK in reply to a SYN * or SYN from reply direction in simultaneous open. */ sender->td_end = sender->td_maxend = end; sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, dataoff, tcph, sender); /* RFC 1323: * Both sides must send the Window Scale option * to enable window scaling in either direction. */ if (dir == IP_CT_DIR_REPLY && !(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) { sender->td_scale = 0; receiver->td_scale = 0; } } __printf(6, 7) static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const struct ip_ct_tcp_state *sender, enum nf_ct_tcp_action ret, const char *fmt, ...) { const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct)); struct va_format vaf; va_list args; bool be_liberal; be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal; if (be_liberal) return NFCT_TCP_ACCEPT; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf); va_end(args); return ret; } static enum nf_ct_tcp_action tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, const struct nf_hook_state *hook_state) { struct ip_ct_tcp *state = &ct->proto.tcp; struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; __u32 seq, ack, sack, end, win, swin; bool in_recv_win, seq_ok; s32 receiver_offset; u16 win_raw; /* * Get the required data from the packet. */ seq = ntohl(tcph->seq); ack = sack = ntohl(tcph->ack_seq); win_raw = ntohs(tcph->window); win = win_raw; end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) tcp_sack(skb, dataoff, tcph, &sack); /* Take into account NAT sequence number mangling */ receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); ack -= receiver_offset; sack -= receiver_offset; if (sender->td_maxwin == 0) { /* * Initialize sender data. */ if (tcph->syn) { tcp_init_sender(sender, receiver, skb, dataoff, tcph, end, win, dir); if (!tcph->ack) /* Simultaneous open */ return NFCT_TCP_ACCEPT; } else { /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ sender->td_end = end; swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; if (receiver->td_maxwin == 0) { /* We haven't seen traffic in the other * direction yet but we have to tweak window * tracking to pass III and IV until that * happens. */ receiver->td_end = receiver->td_maxend = sack; } else if (sack == receiver->td_end + 1) { /* Likely a reply to a keepalive. * Needed for III. */ receiver->td_end++; } } } else if (tcph->syn && after(end, sender->td_end) && (state->state == TCP_CONNTRACK_SYN_SENT || state->state == TCP_CONNTRACK_SYN_RECV)) { /* * RFC 793: "if a TCP is reinitialized ... then it need * not wait at all; it must only be sure to use sequence * numbers larger than those recently used." * * Re-init state for this direction, just like for the first * syn(-ack) reply, it might differ in seq, ack or tcp options. */ tcp_init_sender(sender, receiver, skb, dataoff, tcph, end, win, dir); if (dir == IP_CT_DIR_REPLY && !tcph->ack) return NFCT_TCP_ACCEPT; } if (!(tcph->ack)) { /* * If there is no ACK, just pretend it was set and OK. */ ack = sack = receiver->td_end; } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == (TCP_FLAG_ACK|TCP_FLAG_RST)) && (ack == 0)) { /* * Broken TCP stacks, that set ACK in RST packets as well * with zero ack value. */ ack = sack = receiver->td_end; } if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* * RST sent answering SYN. */ seq = end = sender->td_end; seq_ok = before(seq, sender->td_maxend + 1); if (!seq_ok) { u32 overshot = end - sender->td_maxend + 1; bool ack_ok; ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); in_recv_win = receiver->td_maxwin && after(end, sender->td_end - receiver->td_maxwin - 1); if (in_recv_win && ack_ok && overshot <= receiver->td_maxwin && before(sack, receiver->td_end + 1)) { /* Work around TCPs that send more bytes than allowed by * the receive window. * * If the (marked as invalid) packet is allowed to pass by * the ruleset and the peer acks this data, then its possible * all future packets will trigger 'ACK is over upper bound' check. * * Thus if only the sequence check fails then do update td_end so * possible ACK for this data can update internal state. */ sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, "%u bytes more than expected", overshot); } return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID, "SEQ is over upper bound %u (over the window of the receiver)", sender->td_maxend + 1); } if (!before(sack, receiver->td_end + 1)) return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID, "ACK is over upper bound %u (ACKed data not seen yet)", receiver->td_end + 1); /* Is the ending sequence in the receive window (if available)? */ in_recv_win = !receiver->td_maxwin || after(end, sender->td_end - receiver->td_maxwin - 1); if (!in_recv_win) return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, "SEQ is under lower bound %u (already ACKed data retransmitted)", sender->td_end - receiver->td_maxwin - 1); if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, "ignored ACK under lower bound %u (possible overly delayed)", receiver->td_end - MAXACKWINDOW(sender) - 1); /* Take into account window scaling (RFC 1323). */ if (!tcph->syn) win <<= sender->td_scale; /* Update sender data. */ swin = win + (sack - ack); if (sender->td_maxwin < swin) sender->td_maxwin = swin; if (after(end, sender->td_end)) { sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; } if (tcph->ack) { if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { sender->td_maxack = ack; sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; } else if (after(ack, sender->td_maxack)) { sender->td_maxack = ack; } } /* Update receiver data. */ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) receiver->td_maxwin += end - sender->td_maxend; if (after(sack + win, receiver->td_maxend - 1)) { receiver->td_maxend = sack + win; if (win == 0) receiver->td_maxend++; } if (ack == receiver->td_end) receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; /* Check retransmissions. */ if (index == TCP_ACK_SET) { if (state->last_dir == dir && state->last_seq == seq && state->last_ack == ack && state->last_end == end && state->last_win == win_raw) { state->retrans++; } else { state->last_dir = dir; state->last_seq = seq; state->last_ack = ack; state->last_end = end; state->last_win = win_raw; state->retrans = 0; } } return NFCT_TCP_ACCEPT; } static void __cold nf_tcp_handle_invalid(struct nf_conn *ct, enum ip_conntrack_dir dir, int index, const struct sk_buff *skb, const struct nf_hook_state *hook_state) { const unsigned int *timeouts; const struct nf_tcp_net *tn; unsigned int timeout; u32 expires; if (!test_bit(IPS_ASSURED_BIT, &ct->status) || test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) return; /* We don't want to have connections hanging around in ESTABLISHED * state for long time 'just because' conntrack deemed a FIN/RST * out-of-window. * * Shrink the timeout just like when there is unacked data. * This speeds up eviction of 'dead' connections where the * connection and conntracks internal state are out of sync. */ switch (index) { case TCP_RST_SET: case TCP_FIN_SET: break; default: return; } if (ct->proto.tcp.last_dir != dir && (ct->proto.tcp.last_index == TCP_FIN_SET || ct->proto.tcp.last_index == TCP_RST_SET)) { expires = nf_ct_expires(ct); if (expires < 120 * HZ) return; tn = nf_tcp_pernet(nf_ct_net(ct)); timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = tn->timeouts; timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]); if (expires > timeout) { nf_ct_l4proto_log_invalid(skb, ct, hook_state, "packet (index %d, dir %d) response for index %d lower timeout to %u", index, dir, ct->proto.tcp.last_index, timeout); WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); } } else { ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; } } /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| TCPHDR_URG) + 1] = { [TCPHDR_SYN] = 1, [TCPHDR_SYN|TCPHDR_URG] = 1, [TCPHDR_SYN|TCPHDR_ACK] = 1, [TCPHDR_RST] = 1, [TCPHDR_RST|TCPHDR_ACK] = 1, [TCPHDR_FIN|TCPHDR_ACK] = 1, [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, [TCPHDR_ACK] = 1, [TCPHDR_ACK|TCPHDR_URG] = 1, }; static void tcp_error_log(const struct sk_buff *skb, const struct nf_hook_state *state, const char *msg) { nf_l4proto_log_invalid(skb, state, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static bool tcp_error(const struct tcphdr *th, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state) { unsigned int tcplen = skb->len - dataoff; u8 tcpflags; /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { tcp_error_log(skb, state, "truncated packet"); return true; } /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ if (state->net->ct.sysctl_checksum && state->hook == NF_INET_PRE_ROUTING && nf_checksum(skb, state->hook, dataoff, IPPROTO_TCP, state->pf)) { tcp_error_log(skb, state, "bad checksum"); return true; } /* Check TCP flags. */ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { tcp_error_log(skb, state, "invalid tcp flag combination"); return true; } return false; } static noinline bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *th, const struct nf_hook_state *state) { enum tcp_conntrack new_state; struct net *net = nf_ct_net(ct); const struct nf_tcp_net *tn = nf_tcp_pernet(net); /* Don't need lock here: this conntrack not in circulation yet */ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { tcp_error_log(skb, state, "invalid new"); return false; } if (new_state == TCP_CONNTRACK_SYN_SENT) { memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* SYN packet */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (ct->proto.tcp.seen[0].td_maxwin == 0) ct->proto.tcp.seen[0].td_maxwin = 1; ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end; tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); } else if (tn->tcp_loose == 0) { /* Don't try to pick up connections. */ return false; } else { memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (ct->proto.tcp.seen[0].td_maxwin == 0) ct->proto.tcp.seen[0].td_maxwin = 1; ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end + ct->proto.tcp.seen[0].td_maxwin; /* We assume SACK and liberal window checking to handle * window scaling */ ct->proto.tcp.seen[0].flags = ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | IP_CT_TCP_FLAG_BE_LIBERAL; } /* tcp_packet will set them */ ct->proto.tcp.last_index = TCP_NONE_SET; return true; } static bool tcp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.tcp.state) { case TCP_CONNTRACK_FIN_WAIT: case TCP_CONNTRACK_LAST_ACK: case TCP_CONNTRACK_TIME_WAIT: case TCP_CONNTRACK_CLOSE: case TCP_CONNTRACK_CLOSE_WAIT: return true; default: break; } return false; } void nf_conntrack_tcp_set_closing(struct nf_conn *ct) { enum tcp_conntrack old_state; const unsigned int *timeouts; u32 timeout; if (!nf_ct_is_confirmed(ct)) return; spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; ct->proto.tcp.state = TCP_CONNTRACK_CLOSE; if (old_state == TCP_CONNTRACK_CLOSE || test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { spin_unlock_bh(&ct->lock); return; } timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) { const struct nf_tcp_net *tn; tn = nf_tcp_pernet(nf_ct_net(ct)); timeouts = tn->timeouts; } timeout = timeouts[TCP_CONNTRACK_CLOSE]; WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); spin_unlock_bh(&ct->lock); nf_conntrack_event_cache(IPCT_PROTOINFO, ct); } static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) { state->td_end = 0; state->td_maxend = 0; state->td_maxwin = 0; state->td_maxack = 0; state->td_scale = 0; state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Returns verdict for packet, or -1 for invalid. */ int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = nf_tcp_pernet(net); enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; enum nf_ct_tcp_action res; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; unsigned long timeout; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) return -NF_ACCEPT; if (tcp_error(th, skb, dataoff, state)) return -NF_ACCEPT; if (!nf_ct_is_confirmed(ct) && !tcp_new(ct, skb, dataoff, th, state)) return -NF_ACCEPT; spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: if (old_state < TCP_CONNTRACK_TIME_WAIT) break; /* RFC 1122: "When a connection is closed actively, * it MUST linger in TIME-WAIT state for a time 2xMSL * (Maximum Segment Lifetime). However, it MAY accept * a new SYN from the remote TCP to reopen the connection * directly from TIME-WAIT state, if..." * We ignore the conditions because we are in the * TIME-WAIT state anyway. * * Handle aborted connections: we and the server * think there is an existing connection but the client * aborts it and starts a new one. */ if (((ct->proto.tcp.seen[dir].flags | ct->proto.tcp.seen[!dir].flags) & IP_CT_TCP_FLAG_CLOSE_INIT) || (ct->proto.tcp.last_dir == dir && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ spin_unlock_bh(&ct->lock); /* Only repeat if we can actually remove the timer. * Destruction may already be in progress in process * context and we must give it a chance to terminate. */ if (nf_ct_kill(ct)) return -NF_REPEAT; return NF_DROP; } fallthrough; case TCP_CONNTRACK_IGNORE: /* Ignored packets: * * Our connection entry may be out of sync, so ignore * packets which may signal the real connection between * the client and the server. * * a) SYN in ORIGINAL * b) SYN/ACK in REPLY * c) ACK in reply direction after initial SYN in original. * * If the ignored packet is invalid, the receiver will send * a RST we'll catch below. */ if (index == TCP_SYNACK_SET && ct->proto.tcp.last_index == TCP_SYN_SET && ct->proto.tcp.last_dir != dir && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* b) This SYN/ACK acknowledges a SYN that we earlier * ignored as invalid. This means that the client and * the server are both in sync, while the firewall is * not. We get in sync from the previously annotated * values. */ old_state = TCP_CONNTRACK_SYN_SENT; new_state = TCP_CONNTRACK_SYN_RECV; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = ct->proto.tcp.last_end; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = ct->proto.tcp.last_end; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = ct->proto.tcp.last_win == 0 ? 1 : ct->proto.tcp.last_win; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = ct->proto.tcp.last_wscale; ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]); break; } ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; ct->proto.tcp.last_seq = ntohl(th->seq); ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.last_win = ntohs(th->window); /* a) This is a SYN in ORIGINAL. The client and the server * may be in sync but we are not. In that case, we annotate * the TCP options and let the packet go through. If it is a * valid SYN packet, the server will reply with a SYN/ACK, and * then we'll get in sync. Otherwise, the server potentially * responds with a challenge ACK if implementing RFC5961. */ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { struct ip_ct_tcp_state seen = {}; ct->proto.tcp.last_flags = ct->proto.tcp.last_wscale = 0; tcp_options(skb, dataoff, th, &seen); if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; ct->proto.tcp.last_wscale = seen.td_scale; } if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_SACK_PERM; } /* Mark the potential for RFC5961 challenge ACK, * this pose a special problem for LAST_ACK state * as ACK is intrepretated as ACKing last FIN. */ if (old_state == TCP_CONNTRACK_LAST_ACK) ct->proto.tcp.last_flags |= IP_CT_EXP_CHALLENGE_ACK; } /* possible challenge ack reply to syn */ if (old_state == TCP_CONNTRACK_SYN_SENT && index == TCP_ACK_SET && dir == IP_CT_DIR_REPLY) ct->proto.tcp.last_ack = ntohl(th->ack_seq); spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "packet (index %d) in dir %d ignored, state %s", index, dir, tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or * the SYN/ACK from the server is lost, the client may transmit * a keep-alive packet while in SYN_SENT state. This needs to * be associated with the original conntrack entry in order to * generate a new SYN with the correct sequence number. */ if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); spin_unlock_bh(&ct->lock); return NF_ACCEPT; } /* Invalid packet */ spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "packet (index %d) in dir %d invalid, state %s", index, dir, tcp_conntrack_names[old_state]); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance cause stack to send "challenge-ACK" * e.g. in response to spurious SYNs. Conntrack MUST * not believe this ACK is acking last FIN. */ if (old_state == TCP_CONNTRACK_LAST_ACK && index == TCP_ACK_SET && ct->proto.tcp.last_dir != dir && ct->proto.tcp.last_index == TCP_SYN_SET && (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; case TCP_CONNTRACK_SYN_SENT2: /* tcp_conntracks table is not smart enough to handle * simultaneous open. */ ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN; break; case TCP_CONNTRACK_SYN_RECV: if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET && ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN) new_state = TCP_CONNTRACK_ESTABLISHED; break; case TCP_CONNTRACK_CLOSE: if (index != TCP_RST_SET) break; /* If we are closing, tuple might have been re-used already. * last_index, last_ack, and all other ct fields used for * sequence/window validation are outdated in that case. * * As the conntrack can already be expired by GC under pressure, * just skip validation checks. */ if (tcp_can_early_drop(ct)) goto in_window; /* td_maxack might be outdated if we let a SYN through earlier */ if ((ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) && ct->proto.tcp.last_index != TCP_SYN_SET) { u32 seq = ntohl(th->seq); /* If we are not in established state and SEQ=0 this is most * likely an answer to a SYN we let go through above (last_index * can be updated due to out-of-order ACKs). */ if (seq == 0 && !nf_conntrack_tcp_established(ct)) break; if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) && !tn->tcp_ignore_invalid_rst) { /* Invalid RST */ spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst"); return -NF_ACCEPT; } if (!nf_conntrack_tcp_established(ct) || seq == ct->proto.tcp.seen[!dir].td_maxack) break; /* Check if rst is part of train, such as * foo:80 > bar:4379: P, 235946583:235946602(19) ack 42 * foo:80 > bar:4379: R, 235946602:235946602(0) ack 42 */ if (ct->proto.tcp.last_index == TCP_ACK_SET && ct->proto.tcp.last_dir == dir && seq == ct->proto.tcp.last_end) break; /* ... RST sequence number doesn't match exactly, keep * established state to allow a possible challenge ACK. */ new_state = old_state; } if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_SYN_SET) || (!test_bit(IPS_ASSURED_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* RST sent to invalid SYN or ACK we had let through * at a) and c) above: * * a) SYN was in window then * c) we hold a half-open connection. * * Delete our connection entry. * We skip window checking, because packet might ACK * segments we ignored. */ goto in_window; } /* Reset in response to a challenge-ack we let through earlier */ if (old_state == TCP_CONNTRACK_SYN_SENT && ct->proto.tcp.last_index == TCP_ACK_SET && ct->proto.tcp.last_dir == IP_CT_DIR_REPLY && ntohl(th->seq) == ct->proto.tcp.last_ack) goto in_window; break; default: /* Keep compilers happy. */ break; } res = tcp_in_window(ct, dir, index, skb, dataoff, th, state); switch (res) { case NFCT_TCP_IGNORE: spin_unlock_bh(&ct->lock); return NF_ACCEPT; case NFCT_TCP_INVALID: nf_tcp_handle_invalid(ct, dir, index, skb, state); spin_unlock_bh(&ct->lock); return -NF_ACCEPT; case NFCT_TCP_ACCEPT: break; } in_window: /* From now on we have got in-window packets */ ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = tn->timeouts; if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else if (unlikely(index == TCP_RST_SET)) timeout = timeouts[TCP_CONNTRACK_CLOSE]; else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; else if (ct->proto.tcp.last_win == 0 && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common problem case, so we can delete the conntrack immediately. --RR */ if (th->rst) { nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } if (index == TCP_SYN_SET && old_state == TCP_CONNTRACK_SYN_SENT) { /* do not renew timeout on SYN retransmit. * * Else port reuse by client or NAT middlebox can keep * entry alive indefinitely (including nat info). */ return NF_ACCEPT; } /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection * pickup with loose=1. Avoid large ESTABLISHED timeout. */ if (new_state == TCP_CONNTRACK_ESTABLISHED && timeout > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { /* Set ASSURED if we see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &ct->status); nf_conntrack_event_cache(IPCT_ASSURED, ct); } nf_ct_refresh_acct(ct, ctinfo, skb, timeout); return NF_ACCEPT; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state)) goto nla_put_failure; if (destroy) goto skip_state; if (nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, ct->proto.tcp.seen[0].td_scale) || nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, ct->proto.tcp.seen[1].td_scale)) goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[0].flags; if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[1].flags; if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; skip_state: spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; #define TCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; int err; /* updates could not contain anything about the private * protocol info, in that case skip the parsing */ if (!pattr) return 0; err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy, NULL); if (err < 0) return err; if (tb[CTA_PROTOINFO_TCP_STATE] && nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; spin_lock_bh(&ct->lock); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *attr = nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); ct->proto.tcp.seen[0].flags &= ~attr->mask; ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; } if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { struct nf_ct_tcp_flags *attr = nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]); ct->proto.tcp.seen[1].flags &= ~attr->mask; ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask; } if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] && tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { ct->proto.tcp.seen[0].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } spin_unlock_bh(&ct->lock); return 0; } static unsigned int tcp_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { struct nf_tcp_net *tn = nf_tcp_pernet(net); unsigned int *timeouts = data; int i; if (!timeouts) timeouts = tn->timeouts; /* set default TCP timeouts. */ for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) timeouts[i] = tn->timeouts[i]; if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { timeouts[TCP_CONNTRACK_SYN_SENT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { timeouts[TCP_CONNTRACK_SYN_RECV] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; } if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { timeouts[TCP_CONNTRACK_ESTABLISHED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; } if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { timeouts[TCP_CONNTRACK_FIN_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { timeouts[TCP_CONNTRACK_CLOSE_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { timeouts[TCP_CONNTRACK_LAST_ACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; } if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { timeouts[TCP_CONNTRACK_TIME_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_CLOSE]) { timeouts[TCP_CONNTRACK_CLOSE] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; } if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { timeouts[TCP_CONNTRACK_SYN_SENT2] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; } if (tb[CTA_TIMEOUT_TCP_RETRANS]) { timeouts[TCP_CONNTRACK_RETRANS] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; } if (tb[CTA_TIMEOUT_TCP_UNACK]) { timeouts[TCP_CONNTRACK_UNACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; } timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT]; return 0; } static int tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT, htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV, htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK, htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE, htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS, htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK, htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_tcp_init_net(struct net *net) { struct nf_tcp_net *tn = nf_tcp_pernet(net); int i; for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) tn->timeouts[i] = tcp_timeouts[i]; /* timeouts[0] is unused, make it same as SYN_SENT so * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; /* If it is set to zero, we disable picking up already established * connections. */ tn->tcp_loose = 1; /* "Be conservative in what you do, * be liberal in what you accept from others." * If it's non-zero, we mark only out of window RST segments as INVALID. */ tn->tcp_be_liberal = 0; /* If it's non-zero, we turn off RST sequence number check */ tn->tcp_ignore_invalid_rst = 0; /* Max number of the retransmitted packets without receiving an (acceptable) * ACK from the destination. If this number is reached, a shorter timer * will be started. */ tn->tcp_max_retrans = 3; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) tn->offload_timeout = 30 * HZ; #endif } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = { .l4proto = IPPROTO_TCP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = tcp_nlattr_tuple_size, .nlattr_size = TCP_NLATTR_SIZE, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = tcp_timeout_nlattr_to_obj, .obj_to_nlattr = tcp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_TCP_MAX, .obj_size = sizeof(unsigned int) * TCP_CONNTRACK_TIMEOUT_MAX, .nla_policy = tcp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
2 4 5 4 1 9 3 2 1 3 1 2 3 1 2 6 1 5 2 1 1 11 11 2 1 13 17 1 1 1 2 1 14 13 1 13 13 11 2 7 8 1 3 7 7 5 2 4 2 2 1 1 2 1 1 5 1 1 1 1 1 1 1 1 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 | /* * Copyright (c) 2007, 2020 Oracle and/or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ #include "rds.h" /* * XXX * - build with sparse * - should we detect duplicate keys on a socket? hmm. * - an rdma is an mlock, apply rlimit? */ /* * get the number of pages by looking at the page indices that the start and * end addresses fall in. * * Returns 0 if the vec is invalid. It is invalid if the number of bytes * causes the address to wrap or overflows an unsigned int. This comes * from being stored in the 'length' member of 'struct scatterlist'. */ static unsigned int rds_pages_in_vec(struct rds_iovec *vec) { if ((vec->addr + vec->bytes <= vec->addr) || (vec->bytes > (u64)UINT_MAX)) return 0; return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - (vec->addr >> PAGE_SHIFT); } static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, struct rds_mr *insert) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct rds_mr *mr; while (*p) { parent = *p; mr = rb_entry(parent, struct rds_mr, r_rb_node); if (key < mr->r_key) p = &(*p)->rb_left; else if (key > mr->r_key) p = &(*p)->rb_right; else return mr; } if (insert) { rb_link_node(&insert->r_rb_node, parent, p); rb_insert_color(&insert->r_rb_node, root); kref_get(&insert->r_kref); } return NULL; } /* * Destroy the transport-specific part of a MR. */ static void rds_destroy_mr(struct rds_mr *mr) { struct rds_sock *rs = mr->r_sock; void *trans_private = NULL; unsigned long flags; rdsdebug("RDS: destroy mr key is %x refcnt %u\n", mr->r_key, kref_read(&mr->r_kref)); spin_lock_irqsave(&rs->rs_rdma_lock, flags); if (!RB_EMPTY_NODE(&mr->r_rb_node)) rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); trans_private = mr->r_trans_private; mr->r_trans_private = NULL; spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (trans_private) mr->r_trans->free_mr(trans_private, mr->r_invalidate); } void __rds_put_mr_final(struct kref *kref) { struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref); rds_destroy_mr(mr); kfree(mr); } /* * By the time this is called we can't have any more ioctls called on * the socket so we don't need to worry about racing with others. */ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; unsigned long flags; /* Release any MRs associated with this socket */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = rb_entry(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); kref_put(&mr->r_kref, __rds_put_mr_final); spin_lock_irqsave(&rs->rs_rdma_lock, flags); } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); } /* * Helper function to pin user pages. */ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { unsigned int gup_flags = FOLL_LONGTERM; int ret; if (write) gup_flags |= FOLL_WRITE; ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { unpin_user_pages(pages, ret); ret = -EFAULT; } return ret; } static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, u64 *cookie_ret, struct rds_mr **mr_ret, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; unsigned int nents = 0; int need_odp = 0; long i; int ret; if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } /* If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. */ if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || PAGE_ALIGN(args->vec.addr + args->vec.bytes) < (args->vec.addr + args->vec.bytes)) { ret = -EINVAL; goto out; } if (!can_do_mlock()) { ret = -EPERM; goto out; } nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; goto out; } /* Restrict the size of mr irrespective of underlying transport * To account for unaligned mr regions, subtract one from nr_pages */ if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { ret = -EMSGSIZE; goto out; } rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); /* XXX clamp nr_pages to limit the size of this alloc? */ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); if (!mr) { ret = -ENOMEM; goto out; } kref_init(&mr->r_kref); RB_CLEAR_NODE(&mr->r_rb_node); mr->r_trans = rs->rs_transport; mr->r_sock = rs; if (args->flags & RDS_RDMA_USE_ONCE) mr->r_use_once = 1; if (args->flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; if (args->flags & RDS_RDMA_READWRITE) mr->r_write = 1; /* * Pin the pages that make up the user buffer and transfer the page * pointers to the mr's sg array. We check to see if we've mapped * the whole region after transferring the partial page references * to the sg array so that we can have one page ref cleanup path. * * For now we have no flag that tells us whether the mapping is * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret == -EOPNOTSUPP) { need_odp = 1; } else if (ret <= 0) { goto out; } else { nents = ret; sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); if (!sg) { ret = -ENOMEM; goto out; } WARN_ON(!nents); sg_init_table(sg, nents); /* Stick all pages into the scatterlist */ for (i = 0 ; i < nents; i++) sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); rdsdebug("RDS: trans_private nents is %u\n", nents); } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { /* In ODP case, we don't GUP pages, so don't need * to release anything. */ if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } mr->r_trans_private = trans_private; rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", mr->r_key, (void *)(unsigned long) args->cookie_addr); /* The user may pass us an unaligned address, but we can only * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing <R_Key, offset> and pass that * around. */ if (need_odp) cookie = rds_rdma_make_cookie(mr->r_key, 0); else cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = -EFAULT; goto out; } /* Inserting the new MR into the rbtree bumps its * reference count. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); BUG_ON(found && found != mr); rdsdebug("RDS: get_mr key is %x\n", mr->r_key); if (mr_ret) { kref_get(&mr->r_kref); *mr_ret = mr; } ret = 0; out: kfree(pages); if (mr) kref_put(&mr->r_kref, __rds_put_mr_final); return ret; } int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; /* * Initially, just behave like get_mr(). * TODO: Implement get_mr as wrapper around this * and deprecate it. */ new_args.vec = args.vec; new_args.cookie_addr = args.cookie_addr; new_args.flags = args.flags; return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL); } /* * Free the MR indicated by the given R_Key */ int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; unsigned long flags; if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ if (args.cookie == 0) { if (!rs->rs_transport || !rs->rs_transport->flush_mrs) return -EINVAL; rs->rs_transport->flush_mrs(); return 0; } /* Look up the MR given its R_key and remove it from the rbtree * so nobody else finds it. * This should also prevent races with rds_rdma_unuse. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); if (mr) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); if (args.flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (!mr) return -EINVAL; kref_put(&mr->r_kref, __rds_put_mr_final); return 0; } /* * This is called when we receive an extension header that * tells us this MR was used. It allows us to implement * use_once semantics */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) { struct rds_mr *mr; unsigned long flags; int zot_me = 0; spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } /* Get a reference so that the MR won't go away before calling * sync_mr() below. */ kref_get(&mr->r_kref); /* If it is going to be freed, remove it from the tree now so * that no other thread can find it and free it. */ if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); /* Release the reference held above. */ kref_put(&mr->r_kref, __rds_put_mr_final); /* If the MR was marked as invalidate, this will * trigger an async flush. */ if (zot_me) kref_put(&mr->r_kref, __rds_put_mr_final); } void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; if (ro->op_odp_mr) { kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final); } else { for (i = 0; i < ro->op_nents; i++) { struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write); } } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) { struct page *page = sg_page(ao->op_sg); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, true); kfree(ao->op_notifier); ao->op_notifier = NULL; ao->op_active = 0; } /* * Count the number of pages needed to describe an incoming iovec array. */ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) { int tot_pages = 0; unsigned int nr_pages; unsigned int i; /* figure out the number of pages in the vector */ for (i = 0; i < nr_iovecs; i++) { nr_pages = rds_pages_in_vec(&iov[i]); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages; } int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov) { struct rds_iovec *vec; struct rds_iovec __user *local_vec; int tot_pages = 0; unsigned int nr_pages; unsigned int i; local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; if (args->nr_local == 0) return -EINVAL; if (args->nr_local > UIO_MAXIOV) return -EMSGSIZE; iov->iov = kcalloc(args->nr_local, sizeof(struct rds_iovec), GFP_KERNEL); if (!iov->iov) return -ENOMEM; vec = &iov->iov[0]; if (copy_from_user(vec, local_vec, args->nr_local * sizeof(struct rds_iovec))) return -EFAULT; iov->len = args->nr_local; /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++, vec++) { nr_pages = rds_pages_in_vec(vec); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages * sizeof(struct scatterlist); } /* * The application asks for a RDMA transfer. * Extract all arguments and set up the rdma_op */ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec) { struct rds_rdma_args *args; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec *iovs; unsigned int i, j; int ret = 0; bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) return -EINVAL; args = CMSG_DATA(cmsg); if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out_ret; } if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out_ret; } if (vec->len != args->nr_local) { ret = -EINVAL; goto out_ret; } /* odp-mr is not supported for multiple requests within one message */ if (args->nr_local != 1) odp_supported = false; iovs = vec->iov; nr_pages = rds_rdma_pages(iovs, args->nr_local); if (nr_pages < 0) { ret = -EINVAL; goto out_ret; } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_ret; } op->op_write = !!(args->flags & RDS_RDMA_READWRITE); op->op_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; op->op_odp_mr = NULL; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); if (IS_ERR(op->op_sg)) { ret = PTR_ERR(op->op_sg); goto out_pages; } if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); if (!op->op_notifier) { ret = -ENOMEM; goto out_pages; } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and * optionally an offset into it. This is how we implement RDMA into * unaligned memory. * When setting up the RDMA, we need to add that offset to the * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ op->op_rkey = rds_rdma_cookie_key(args->cookie); op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, op->op_rkey); for (i = 0; i < args->nr_local; i++) { struct rds_iovec *iov = &iovs[i]; /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ unsigned int nr = rds_pages_in_vec(iov); rs->rs_user_addr = iov->addr; rs->rs_user_bytes = iov->bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if ((!odp_supported && ret <= 0) || (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; if (ret == -EOPNOTSUPP) { struct rds_mr *local_odp_mr; if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out_pages; } local_odp_mr = kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); if (!local_odp_mr) { ret = -ENOMEM; goto out_pages; } RB_CLEAR_NODE(&local_odp_mr->r_rb_node); kref_init(&local_odp_mr->r_kref); local_odp_mr->r_trans = rs->rs_transport; local_odp_mr->r_sock = rs; local_odp_mr->r_trans_private = rs->rs_transport->get_mr( NULL, 0, rs, &local_odp_mr->r_key, NULL, iov->addr, iov->bytes, ODP_VIRTUAL); if (IS_ERR(local_odp_mr->r_trans_private)) { ret = PTR_ERR(local_odp_mr->r_trans_private); rdsdebug("get_mr ret %d %p\"", ret, local_odp_mr->r_trans_private); kfree(local_odp_mr); ret = -EOPNOTSUPP; goto out_pages; } rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", local_odp_mr, local_odp_mr->r_trans_private); op->op_odp_mr = local_odp_mr; op->op_odp_addr = iov->addr; } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { unsigned int offset = iov->addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); iov->addr += sg->length; iov->bytes -= sg->length; } op->op_nents += nr; } if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, (unsigned int) args->remote_vec.bytes); ret = -EINVAL; goto out_pages; } op->op_bytes = nr_bytes; ret = 0; out_pages: kfree(pages); out_ret: if (ret) rds_rdma_free_op(op); else rds_stats_inc(s_send_rdma); return ret; } /* * The application wants us to pass an RDMA destination (aka MR) * to the remote */ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { unsigned long flags; struct rds_mr *mr; u32 r_key; int err = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || rm->m_rdma_cookie != 0) return -EINVAL; memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); /* We are reusing a previously mapped MR here. Most likely, the * application has written to the buffer, so we need to explicitly * flush those writes to RAM. Otherwise the HCA may not see them * when doing a DMA from that buffer. */ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) err = -EINVAL; /* invalid r_key */ else kref_get(&mr->r_kref); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; } /* * The application passes us an address range it wants to enable RDMA * to/from. We map the area, and save the <R_Key,offset> pair * in rm->m_rdma_cookie. This causes it to be sent along to the peer * in an extension header. */ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || rm->m_rdma_cookie != 0) return -EINVAL; return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr, rm->m_conn_path); } /* * Fill in rds_message for an atomic request. */ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct page *page = NULL; struct rds_atomic_args *args; int ret = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) || rm->atomic.op_active) return -EINVAL; args = CMSG_DATA(cmsg); /* Nonmasked & masked cmsg ops converted to masked hw ops */ switch (cmsg->cmsg_type) { case RDS_CMSG_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->fadd.add; rm->atomic.op_m_fadd.nocarry_mask = 0; break; case RDS_CMSG_MASKED_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->m_fadd.add; rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; break; case RDS_CMSG_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->cswp.compare; rm->atomic.op_m_cswp.swap = args->cswp.swap; rm->atomic.op_m_cswp.compare_mask = ~0; rm->atomic.op_m_cswp.swap_mask = ~0; break; case RDS_CMSG_MASKED_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->m_cswp.compare; rm->atomic.op_m_cswp.swap = args->m_cswp.swap; rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; break; default: BUG(); /* should never happen */ } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); if (IS_ERR(rm->atomic.op_sg)) { ret = PTR_ERR(rm->atomic.op_sg); goto err; } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { ret = -EFAULT; goto err; } ret = rds_pin_pages(args->local_addr, 1, &page, 1); if (ret != 1) goto err; ret = 0; sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); if (rm->atomic.op_notify || rm->atomic.op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); if (!rm->atomic.op_notifier) { ret = -ENOMEM; goto err; } rm->atomic.op_notifier->n_user_token = args->user_token; rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; } rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); return ret; err: if (page) unpin_user_page(page); rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; } |
40 18 163 86 2 1 5 1 1 2 4 1 1 1 2 22 3 8 3 7 6 1 2 1 2 5 2 7 10 12 3 6 17 7 3 10 29 30 11 10 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_ECN_H_ #define _INET_ECN_H_ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <net/inet_sock.h> #include <net/dsfield.h> #include <net/checksum.h> enum { INET_ECN_NOT_ECT = 0, INET_ECN_ECT_1 = 1, INET_ECN_ECT_0 = 2, INET_ECN_CE = 3, INET_ECN_MASK = 3, }; extern int sysctl_tunnel_ecn_log; static inline int INET_ECN_is_ce(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_CE; } static inline int INET_ECN_is_not_ect(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_NOT_ECT; } static inline int INET_ECN_is_capable(__u8 dsfield) { return dsfield & INET_ECN_ECT_0; } /* * RFC 3168 9.1.1 * The full-functionality option for ECN encapsulation is to copy the * ECN codepoint of the inside header to the outside header on * encapsulation if the inside header is not-ECT or ECT, and to set the * ECN codepoint of the outside header to ECT(0) if the ECN codepoint of * the inside header is CE. */ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) { outer &= ~INET_ECN_MASK; outer |= !INET_ECN_is_ce(inner) ? (inner & INET_ECN_MASK) : INET_ECN_ECT_0; return outer; } static inline void INET_ECN_xmit(struct sock *sk) { inet_sk(sk)->tos |= INET_ECN_ECT_0; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } static inline void INET_ECN_dontxmit(struct sock *sk) { inet_sk(sk)->tos &= ~INET_ECN_MASK; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass &= ~INET_ECN_MASK; } #define IP6_ECN_flow_init(label) do { \ (label) &= ~htonl(INET_ECN_MASK << 20); \ } while (0) #define IP6_ECN_flow_xmit(sk, label) do { \ if (INET_ECN_is_capable(inet6_sk(sk)->tclass)) \ (label) |= htonl(INET_ECN_ECT_0 << 20); \ } while (0) static inline int IP_ECN_set_ce(struct iphdr *iph) { u32 ecn = (iph->tos + 1) & INET_ECN_MASK; __be16 check_add; /* * After the last operation we have (in binary): * INET_ECN_NOT_ECT => 01 * INET_ECN_ECT_1 => 10 * INET_ECN_ECT_0 => 11 * INET_ECN_CE => 00 */ if (!(ecn & 2)) return !ecn; /* * The following gives us: * INET_ECN_ECT_1 => check += htons(0xFFFD) * INET_ECN_ECT_0 => check += htons(0xFFFE) */ check_add = (__force __be16)((__force u16)htons(0xFFFB) + (__force u16)htons(ecn)); iph->check = csum16_add(iph->check, check_add); iph->tos |= INET_ECN_CE; return 1; } static inline int IP_ECN_set_ect1(struct iphdr *iph) { if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; iph->check = csum16_add(iph->check, htons(0x1)); iph->tos ^= INET_ECN_MASK; return 1; } static inline void IP_ECN_clear(struct iphdr *iph) { iph->tos &= ~INET_ECN_MASK; } static inline void ipv4_copy_dscp(unsigned int dscp, struct iphdr *inner) { dscp &= ~INET_ECN_MASK; ipv4_change_dsfield(inner, INET_ECN_MASK, dscp); } struct ipv6hdr; /* Note: * IP_ECN_set_ce() has to tweak IPV4 checksum when setting CE, * meaning both changes have no effect on skb->csum if/when CHECKSUM_COMPLETE * In IPv6 case, no checksum compensates the change in IPv6 header, * so we have to update skb->csum. */ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) return 0; from = *(__be32 *)iph; to = from | htonl(INET_ECN_CE << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; from = *(__be32 *)iph; to = from ^ htonl(INET_ECN_MASK << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); } static inline int INET_ECN_set_ce(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ce(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ce(skb, ipv6_hdr(skb)); break; } return 0; } static inline int skb_get_dsfield(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) break; return ipv4_get_dsfield(ip_hdr(skb)); case cpu_to_be16(ETH_P_IPV6): if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) break; return ipv6_get_dsfield(ipv6_hdr(skb)); } return -1; } static inline int INET_ECN_set_ect1(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ect1(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ect1(skb, ipv6_hdr(skb)); break; } return 0; } /* * RFC 6040 4.2 * To decapsulate the inner header at the tunnel egress, a compliant * tunnel egress MUST set the outgoing ECN field to the codepoint at the * intersection of the appropriate arriving inner header (row) and outer * header (column) in Figure 4 * * +---------+------------------------------------------------+ * |Arriving | Arriving Outer Header | * | Inner +---------+------------+------------+------------+ * | Header | Not-ECT | ECT(0) | ECT(1) | CE | * +---------+---------+------------+------------+------------+ * | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)| * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE | * | ECT(1) | ECT(1) | ECT(1) (!) | ECT(1) | CE | * | CE | CE | CE | CE(!!!)| CE | * +---------+---------+------------+------------+------------+ * * Figure 4: New IP in IP Decapsulation Behaviour * * returns 0 on success * 1 if something is broken and should be logged (!!! above) * 2 if packet should be dropped */ static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce) { if (INET_ECN_is_not_ect(inner)) { switch (outer & INET_ECN_MASK) { case INET_ECN_NOT_ECT: return 0; case INET_ECN_ECT_0: case INET_ECN_ECT_1: return 1; case INET_ECN_CE: return 2; } } *set_ce = INET_ECN_is_ce(outer); return 0; } static inline int INET_ECN_decapsulate(struct sk_buff *skb, __u8 outer, __u8 inner) { bool set_ce = false; int rc; rc = __INET_ECN_decapsulate(outer, inner, &set_ce); if (!rc) { if (set_ce) INET_ECN_set_ce(skb); else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1) INET_ECN_set_ect1(skb); } return rc; } static inline int IP_ECN_decapsulate(const struct iphdr *oiph, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, oiph->tos, inner); } static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } #endif |
4 4 3 3 4 4 11540 11556 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 | // SPDX-License-Identifier: GPL-2.0 /* * SafeSetID Linux Security Module * * Author: Micah Morton <mortonm@chromium.org> * * Copyright (C) 2018 The Chromium OS Authors. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, as * published by the Free Software Foundation. * */ #define pr_fmt(fmt) "SafeSetID: " fmt #include <linux/lsm_hooks.h> #include <linux/module.h> #include <linux/ptrace.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <uapi/linux/lsm.h> #include "lsm.h" /* Flag indicating whether initialization completed */ int safesetid_initialized __initdata; struct setid_ruleset __rcu *safesetid_setuid_rules; struct setid_ruleset __rcu *safesetid_setgid_rules; /* Compute a decision for a transition from @src to @dst under @policy. */ enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy, kid_t src, kid_t dst) { struct setid_rule *rule; enum sid_policy_type result = SIDPOL_DEFAULT; if (policy->type == UID) { hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) { if (!uid_eq(rule->src_id.uid, src.uid)) continue; if (uid_eq(rule->dst_id.uid, dst.uid)) return SIDPOL_ALLOWED; result = SIDPOL_CONSTRAINED; } } else if (policy->type == GID) { hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) { if (!gid_eq(rule->src_id.gid, src.gid)) continue; if (gid_eq(rule->dst_id.gid, dst.gid)){ return SIDPOL_ALLOWED; } result = SIDPOL_CONSTRAINED; } } else { /* Should not reach here, report the ID as contrainsted */ result = SIDPOL_CONSTRAINED; } return result; } /* * Compute a decision for a transition from @src to @dst under the active * policy. */ static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type) { enum sid_policy_type result = SIDPOL_DEFAULT; struct setid_ruleset *pol; rcu_read_lock(); if (new_type == UID) pol = rcu_dereference(safesetid_setuid_rules); else if (new_type == GID) pol = rcu_dereference(safesetid_setgid_rules); else { /* Should not reach here */ result = SIDPOL_CONSTRAINED; rcu_read_unlock(); return result; } if (pol) { pol->type = new_type; result = _setid_policy_lookup(pol, src, dst); } rcu_read_unlock(); return result; } static int safesetid_security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { /* We're only interested in CAP_SETUID and CAP_SETGID. */ if (cap != CAP_SETUID && cap != CAP_SETGID) return 0; /* * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we * want to let it go through here; the real security check happens later, in * the task_fix_set{u/g}id or task_fix_setgroups hooks. */ if ((opts & CAP_OPT_INSETID) != 0) return 0; switch (cap) { case CAP_SETUID: /* * If no policy applies to this task, allow the use of CAP_SETUID for * other purposes. */ if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*uid() (e.g. setting up userns uid mappings). */ pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for * other purposes. */ if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; /* * Reject use of CAP_SETUID for functionality other than calling * set*gid() (e.g. setting up userns gid mappings). */ pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kgid_val(cred->gid)); return -EPERM; default: /* Error, the only capabilities were checking for is CAP_SETUID/GID */ return 0; } return 0; } /* * Check whether a caller with old credentials @old is allowed to switch to * credentials that contain @new_id. */ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type) { bool permitted; /* If our old creds already had this ID in it, it's fine. */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; } else if (new_type == GID){ if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; } else /* Error, new_type is an invalid type */ return false; /* * Transitions to new UIDs require a check against the policy of the old * RUID. */ permitted = setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; if (!permitted) { if (new_type == UID) { pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", __kuid_val(old->uid), __kuid_val(old->euid), __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID) { pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); } else /* Error, new_type is an invalid type */ return false; } return permitted; } /* * Check whether there is either an exception for user under old cred struct to * set*uid to user under new cred struct, or the UID transition is allowed (by * Linux set*uid rules) even without CAP_SETUID. */ static int safesetid_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setuid restrictions for our old RUID. */ if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) && id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) && id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID)) return 0; /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old) { int i; /* Do nothing if there are no setgid restrictions for our old RGID. */ if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT) return 0; get_group_info(new->group_info); for (i = 0; i < new->group_info->ngroups; i++) { if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) { put_group_info(new->group_info); /* * Kill this process to avoid potential security vulnerabilities * that could arise from a missing allowlist entry preventing a * privileged process from dropping to a lesser-privileged one. */ force_sig(SIGKILL); return -EACCES; } } put_group_info(new->group_info); return 0; } static const struct lsm_id safesetid_lsmid = { .name = "safesetid", .id = LSM_ID_SAFESETID, }; static struct security_hook_list safesetid_security_hooks[] = { LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid), LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid), LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups), LSM_HOOK_INIT(capable, safesetid_security_capable) }; static int __init safesetid_security_init(void) { security_add_hooks(safesetid_security_hooks, ARRAY_SIZE(safesetid_security_hooks), &safesetid_lsmid); /* Report that SafeSetID successfully initialized */ safesetid_initialized = 1; return 0; } DEFINE_LSM(safesetid_security_init) = { .init = safesetid_security_init, .name = "safesetid", }; |
48 286 102 20 148 124 1 5 6 6 112 26 26 24 2 53 52 17 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Authors: Lotsa people, from code originally in tcp */ #ifndef _INET_HASHTABLES_H #define _INET_HASHTABLES_H #include <linux/interrupt.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/wait.h> #include <net/inet_connection_sock.h> #include <net/inet_sock.h> #include <net/ip.h> #include <net/sock.h> #include <net/route.h> #include <net/tcp_states.h> #include <net/netns/hash.h> #include <linux/refcount.h> #include <asm/byteorder.h> /* This is for all connections with a full identity, no wildcards. * The 'e' prefix stands for Establish, but we really put all sockets * but LISTEN ones. */ struct inet_ehash_bucket { struct hlist_nulls_head chain; }; /* There are a few simple rules, which allow for local port reuse by * an application. In essence: * * 1) Sockets bound to different interfaces may share a local port. * Failing that, goto test 2. * 2) If all sockets have sk->sk_reuse set, and none of them are in * TCP_LISTEN state, the port may be shared. * Failing that, goto test 3. * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local * address, and none of them are the same, the port may be * shared. * Failing this, the port cannot be shared. * * The interesting point, is test #2. This is what an FTP server does * all day. To optimize this case we use a specific flag bit defined * below. As we add sockets to a bind bucket list, we perform a * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN)) * As long as all sockets added to a bind bucket pass this test, * the flag bit will be set. * The resulting situation is that tcp_v[46]_verify_bind() can just check * for this flag bit, if it is set and the socket trying to bind has * sk->sk_reuse set, we don't even have to walk the owners list at all, * we return that it is ok to bind this socket to the requested local port. * * Sounds like a lot of work, but it is worth it. In a more naive * implementation (ie. current FreeBSD etc.) the entire list of ports * must be walked for each data port opened by an ftp server. Needless * to say, this does not scale at all. With a couple thousand FTP * users logged onto your box, isn't it nice to know that new data * ports are created in O(1) time? I thought so. ;-) -DaveM */ #define FASTREUSEPORT_ANY 1 #define FASTREUSEPORT_STRICT 2 struct inet_bind_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr fast_v6_rcv_saddr; #endif __be32 fast_rcv_saddr; unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; }; struct inet_bind2_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else __be32 rcv_saddr; #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) { return read_pnet(&ib->ib_net); } static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } #define inet_bind_bucket_for_each(tb, head) \ hlist_for_each_entry(tb, head, node) struct inet_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Sockets can be hashed in established or listening table. * We must use different 'nulls' end-of-chain value for all hash buckets : * A socket might transition from ESTABLISH to LISTEN state without * RCU grace period. A lookup in ehash table needs to handle this case. */ #define LISTENING_NULLS_BASE (1U << 29) struct inet_listen_hashbucket { spinlock_t lock; struct hlist_nulls_head nulls_head; }; /* This is for listening sockets, thus all sockets which possess wildcards. */ #define INET_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */ struct inet_hashinfo { /* This is for sockets with full identity only. Sockets here will * always be without wildcards and will have the following invariant: * * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE * */ struct inet_ehash_bucket *ehash; spinlock_t *ehash_locks; unsigned int ehash_mask; unsigned int ehash_locks_mask; /* Ok, let's try this, I give up, we do need a local binding * TCP hash as well as the others for fast bind/connect. */ struct kmem_cache *bind_bucket_cachep; /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; struct kmem_cache *bind2_bucket_cachep; /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used * primarily for expediting bind conflict resolution. */ struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; bool pernet; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { #if IS_ENABLED(CONFIG_IP_DCCP) return sk->sk_prot->h.hashinfo ? : sock_net(sk)->ipv4.tcp_death_row.hashinfo; #else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; #endif } static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { return &h->lhash2[hash & h->lhash2_mask]; } static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash[hash & hashinfo->ehash_mask]; } static inline spinlock_t *inet_ehash_lockp( struct inet_hashinfo *hashinfo, unsigned int hash) { return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask]; } int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) { kfree(h->lhash2); h->lhash2 = NULL; } static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); hashinfo->ehash_locks = NULL; } struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, int l3mdev); struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb); struct inet_bind2_bucket * inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, int l3mdev, const struct sock *sk); static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } static inline struct inet_bind_hashbucket * inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, const struct net *net, unsigned short port) { u32 hash; #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); else #endif hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); /* This should be called whenever a socket's sk_rcv_saddr (ipv4) or * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. */ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); void inet_put_port(struct sock *sk); void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif) { return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } /* Socket demux engine toys. */ /* What happens here is ugly; there's a pair of adjacent fields in struct inet_sock; __be16 dport followed by __u16 num. We want to search by pair, so we combine the keys into a single 32bit value and compare with 32bit value read from &...->dport. Let's at least make sure that it's not mixed with anything else... On 64bit targets we combine comparisons with pair of adjacent __be32 fields in the same way. */ #ifdef __BIG_ENDIAN #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport))) #else /* __LITTLE_ENDIAN */ #define INET_COMBINED_PORTS(__sport, __dport) \ ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport))) #endif #ifdef __BIG_ENDIAN #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__saddr)) << 32) | \ ((__force __u64)(__be32)(__daddr))) #else /* __LITTLE_ENDIAN */ #define INET_ADDR_COOKIE(__name, __saddr, __daddr) \ const __addrpair __name = (__force __addrpair) ( \ (((__force __u64)(__be32)(__daddr)) << 32) | \ ((__force __u64)(__be32)(__saddr))) #endif /* __BIG_ENDIAN */ static inline bool inet_match(const struct net *net, const struct sock *sk, const __addrpair cookie, const __portpair ports, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || sk->sk_portpair != ports || sk->sk_addrpair != cookie) return false; /* READ_ONCE() paired with WRITE_ONCE() in sock_bindtoindex_locked() */ return inet_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif); } /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need * not check it for lookups anymore, thanks Alexey. -DaveM */ struct sock *__inet_lookup_established(const struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); typedef u32 (inet_ehashfn_t)(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport); inet_ehashfn_t inet_ehashfn; INDIRECT_CALLABLE_DECLARE(inet_ehashfn_t udp_ehashfn); struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, inet_ehashfn_t *ehashfn); struct sock *inet_lookup_run_sk_lookup(const struct net *net, int protocol, struct sk_buff *skb, int doff, __be32 saddr, __be16 sport, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); static inline struct sock * inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif, const int sdif, bool *refcounted) { u16 hnum = ntohs(dport); struct sock *sk; sk = __inet_lookup_established(net, hashinfo, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } static inline struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, bool *refcounted, inet_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool prefetched; sk = skb_steal_sock(skb, refcounted, &prefetched); if (!sk) return NULL; if (!prefetched || !sk_fullsock(sk)) return sk; if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_state != TCP_LISTEN) return sk; } else if (sk->sk_protocol == IPPROTO_UDP) { if (sk->sk_state != TCP_CLOSE) return sk; } else { return sk; } reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, ntohs(dport), ehashfn); if (!reuse_sk) return sk; /* We've chosen a new reuseport sock which is never refcounted. This * implies that sk also isn't refcounted. */ WARN_ON_ONCE(*refcounted); return reuse_sk; } static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; sk = inet_steal_sock(net, skb, doff, iph->saddr, sport, iph->daddr, dport, refcounted, inet_ehashfn); if (IS_ERR(sk)) return NULL; if (sk) return sk; return __inet_lookup(net, hashinfo, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } static inline void sk_daddr_set(struct sock *sk, __be32 addr) { sk->sk_daddr = addr; /* alias of inet_daddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_daddr); #endif } static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) { sk->sk_rcv_saddr = addr; /* alias of inet_rcv_saddr */ #if IS_ENABLED(CONFIG_IPV6) ipv6_addr_set_v4mapped(addr, &sk->sk_v6_rcv_saddr); #endif } int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); #endif /* _INET_HASHTABLES_H */ |
330 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ICMPV6_H #define _LINUX_ICMPV6_H #include <linux/skbuff.h> #include <linux/ipv6.h> #include <uapi/linux/icmpv6.h> static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) { return (struct icmp6hdr *)skb_transport_header(skb); } #include <linux/netdevice.h> #if IS_ENABLED(CONFIG_IPV6) typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); #if IS_BUILTIN(CONFIG_IPV6) static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { icmp6_send(skb, type, code, info, NULL, parm); } static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { BUILD_BUG_ON(fn != icmp6_send); return 0; } static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { BUILD_BUG_ON(fn != icmp6_send); return 0; } #else extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm); extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); #endif static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { __icmpv6_send(skb, type, code, info, IP6CB(skb)); } int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len); #if IS_ENABLED(CONFIG_NF_NAT) void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); #else static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; __icmpv6_send(skb_in, type, code, info, &parm); } #endif #else static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { } static inline void icmpv6_ndo_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { } #endif extern int icmpv6_init(void); extern int icmpv6_err_convert(u8 type, u8 code, int *err); extern void icmpv6_cleanup(void); extern void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos, enum skb_drop_reason reason); struct flowi6; struct in6_addr; void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif); static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { icmpv6_param_prob_reason(skb, code, pos, SKB_DROP_REASON_NOT_SPECIFIED); } static inline bool icmpv6_is_err(int type) { switch (type) { case ICMPV6_DEST_UNREACH: case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: return true; } return false; } #endif |
2 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 | /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt.h: declarations for per-file encryption * * Filesystems that implement per-file encryption must include this header * file. * * Copyright (C) 2015, Google, Inc. * * Written by Michael Halcrow, 2015. * Modified by Jaegeuk Kim, 2015. */ #ifndef _LINUX_FSCRYPT_H #define _LINUX_FSCRYPT_H #include <linux/fs.h> #include <linux/mm.h> #include <linux/slab.h> #include <uapi/linux/fscrypt.h> /* * The lengths of all file contents blocks must be divisible by this value. * This is needed to ensure that all contents encryption modes will work, as * some of the supported modes don't support arbitrarily byte-aligned messages. * * Since the needed alignment is 16 bytes, most filesystems will meet this * requirement naturally, as typical block sizes are powers of 2. However, if a * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via * compression), then it will need to pad to this alignment before encryption. */ #define FSCRYPT_CONTENTS_ALIGNMENT 16 union fscrypt_policy; struct fscrypt_inode_info; struct fs_parameter; struct seq_file; struct fscrypt_str { unsigned char *name; u32 len; }; struct fscrypt_name { const struct qstr *usr_fname; struct fscrypt_str disk_name; u32 hash; u32 minor_hash; struct fscrypt_str crypto_buf; bool is_nokey_name; }; #define FSTR_INIT(n, l) { .name = n, .len = l } #define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) #define fname_name(p) ((p)->disk_name.name) #define fname_len(p) ((p)->disk_name.len) /* Maximum value for the third parameter of fscrypt_operations.set_context(). */ #define FSCRYPT_SET_CONTEXT_MAX_SIZE 40 #ifdef CONFIG_FS_ENCRYPTION /* Crypto operations for filesystems */ struct fscrypt_operations { /* * If set, then fs/crypto/ will allocate a global bounce page pool the * first time an encryption key is set up for a file. The bounce page * pool is required by the following functions: * * - fscrypt_encrypt_pagecache_blocks() * - fscrypt_zeroout_range() for files not using inline crypto * * If the filesystem doesn't use those, it doesn't need to set this. */ unsigned int needs_bounce_pages : 1; /* * If set, then fs/crypto/ will allow the use of encryption settings * that assume inode numbers fit in 32 bits (i.e. * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other * prerequisites for these settings are also met. This is only useful * if the filesystem wants to support inline encryption hardware that is * limited to 32-bit or 64-bit data unit numbers and where programming * keyslots is very slow. */ unsigned int has_32bit_inodes : 1; /* * If set, then fs/crypto/ will allow users to select a crypto data unit * size that is less than the filesystem block size. This is done via * the log2_data_unit_size field of the fscrypt policy. This flag is * not compatible with filesystems that encrypt variable-length blocks * (i.e. blocks that aren't all equal to filesystem's block size), for * example as a result of compression. It's also not compatible with * the fscrypt_encrypt_block_inplace() and * fscrypt_decrypt_block_inplace() functions. */ unsigned int supports_subblock_data_units : 1; /* * This field exists only for backwards compatibility reasons and should * only be set by the filesystems that are setting it already. It * contains the filesystem-specific key description prefix that is * accepted for "logon" keys for v1 fscrypt policies. This * functionality is deprecated in favor of the generic prefix * "fscrypt:", which itself is deprecated in favor of the filesystem * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY. Filesystems that * are newly adding fscrypt support should not set this field. */ const char *legacy_key_prefix; /* * Get the fscrypt context of the given inode. * * @inode: the inode whose context to get * @ctx: the buffer into which to get the context * @len: length of the @ctx buffer in bytes * * Return: On success, returns the length of the context in bytes; this * may be less than @len. On failure, returns -ENODATA if the * inode doesn't have a context, -ERANGE if the context is * longer than @len, or another -errno code. */ int (*get_context)(struct inode *inode, void *ctx, size_t len); /* * Set an fscrypt context on the given inode. * * @inode: the inode whose context to set. The inode won't already have * an fscrypt context. * @ctx: the context to set * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE) * @fs_data: If called from fscrypt_set_context(), this will be the * value the filesystem passed to fscrypt_set_context(). * Otherwise (i.e. when called from * FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL. * * i_rwsem will be held for write. * * Return: 0 on success, -errno on failure. */ int (*set_context)(struct inode *inode, const void *ctx, size_t len, void *fs_data); /* * Get the dummy fscrypt policy in use on the filesystem (if any). * * Filesystems only need to implement this function if they support the * test_dummy_encryption mount option. * * Return: A pointer to the dummy fscrypt policy, if the filesystem is * mounted with test_dummy_encryption; otherwise NULL. */ const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb); /* * Check whether a directory is empty. i_rwsem will be held for write. */ bool (*empty_dir)(struct inode *inode); /* * Check whether the filesystem's inode numbers and UUID are stable, * meaning that they will never be changed even by offline operations * such as filesystem shrinking and therefore can be used in the * encryption without the possibility of files becoming unreadable. * * Filesystems only need to implement this function if they want to * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags. These * flags are designed to work around the limitations of UFS and eMMC * inline crypto hardware, and they shouldn't be used in scenarios where * such hardware isn't being used. * * Leaving this NULL is equivalent to always returning false. */ bool (*has_stable_inodes)(struct super_block *sb); /* * Return an array of pointers to the block devices to which the * filesystem may write encrypted file contents, NULL if the filesystem * only has a single such block device, or an ERR_PTR() on error. * * On successful non-NULL return, *num_devs is set to the number of * devices in the returned array. The caller must free the returned * array using kfree(). * * If the filesystem can use multiple block devices (other than block * devices that aren't used for encrypted file contents, such as * external journal devices), and wants to support inline encryption, * then it must implement this function. Otherwise it's not needed. */ struct block_device **(*get_devices)(struct super_block *sb, unsigned int *num_devs); }; int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info(). * I.e., another task may publish ->i_crypt_info concurrently, executing * a RELEASE barrier. We need to use smp_load_acquire() here to safely * ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_crypt_info); } /** * fscrypt_needs_contents_encryption() - check whether an inode needs * contents encryption * @inode: the inode to check * * Return: %true iff the inode is an encrypted regular file and the kernel was * built with fscrypt support. * * If you need to know whether the encrypt bit is set even when the kernel was * built without fscrypt support, you must use IS_ENCRYPTED() directly instead. */ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode); } /* * When d_splice_alias() moves a directory's no-key alias to its * plaintext alias as a result of the encryption key being added, * DCACHE_NOKEY_NAME must be cleared and there might be an opportunity * to disable d_revalidate. Note that we don't have to support the * inverse operation because fscrypt doesn't allow no-key names to be * the source or target of a rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { /* * VFS calls fscrypt_handle_d_move even for non-fscrypt * filesystems. */ if (dentry->d_flags & DCACHE_NOKEY_NAME) { dentry->d_flags &= ~DCACHE_NOKEY_NAME; /* * Other filesystem features might be handling dentry * revalidation, in which case it cannot be disabled. */ if (dentry->d_op->d_revalidate == fscrypt_d_revalidate) dentry->d_flags &= ~DCACHE_OP_REVALIDATE; } } /** * fscrypt_is_nokey_name() - test whether a dentry is a no-key name * @dentry: the dentry to check * * This returns true if the dentry is a no-key dentry. A no-key dentry is a * dentry that was created in an encrypted directory that hasn't had its * encryption key added yet. Such dentries may be either positive or negative. * * When a filesystem is asked to create a new filename in an encrypted directory * and the new filename's dentry is a no-key dentry, it must fail the operation * with ENOKEY. This includes ->create(), ->mkdir(), ->mknod(), ->symlink(), * ->rename(), and ->link(). (However, ->rename() and ->link() are already * handled by fscrypt_prepare_rename() and fscrypt_prepare_link().) * * This is necessary because creating a filename requires the directory's * encryption key, but just checking for the key on the directory inode during * the final filesystem operation doesn't guarantee that the key was available * during the preceding dentry lookup. And the key must have already been * available during the dentry lookup in order for it to have been checked * whether the filename already exists in the directory and for the new file's * dentry not to be invalidated due to it incorrectly having the no-key flag. * * Return: %true if the dentry is a no-key name */ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return dentry->d_flags & DCACHE_NOKEY_NAME; } static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { /* * This code tries to only take ->d_lock when necessary to write * to ->d_flags. We shouldn't be peeking on d_flags for * DCACHE_OP_REVALIDATE unlocked, but in the unlikely case * there is a race, the worst it can happen is that we fail to * unset DCACHE_OP_REVALIDATE and pay the cost of an extra * d_revalidate. */ if (is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); } else if (dentry->d_flags & DCACHE_OP_REVALIDATE && dentry->d_op->d_revalidate == fscrypt_d_revalidate) { /* * Unencrypted dentries and encrypted dentries where the * key is available are always valid from fscrypt * perspective. Avoid the cost of calling * fscrypt_d_revalidate unnecessarily. */ spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_OP_REVALIDATE; spin_unlock(&dentry->d_lock); } } /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags); int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags); int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs); int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num); static inline bool fscrypt_is_bounce_page(struct page *page) { return page->mapping == NULL; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { return (struct page *)page_private(bounce_page); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return folio->mapping == NULL; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { return bounce_folio->private; } void fscrypt_free_bounce_page(struct page *bounce_page); /* policy.c */ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg); int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { const union fscrypt_policy *policy; }; int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy); bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2); void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb); static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return dummy_policy->policy != NULL; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { kfree(dummy_policy->policy); dummy_policy->policy = NULL; } /* keyring.c */ void fscrypt_destroy_keyring(struct super_block *sb); int fscrypt_ioctl_add_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg); int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg); int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg); /* keysetup.c */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret); void fscrypt_put_encryption_info(struct inode *inode); void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen); bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); static inline void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); } int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str); void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str); int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname); bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry); int __fscrypt_prepare_readdir(struct inode *dir); int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags); int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link); int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link); const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done); int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat); static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { sb->s_cop = s_cop; } #else /* !CONFIG_FS_ENCRYPTION */ static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { return NULL; } static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) { return false; } static inline void fscrypt_handle_d_move(struct dentry *dentry) { } static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) { return false; } static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { } /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { } static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, unsigned int len, unsigned int offs, gfp_t gfp_flags) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { return -EOPNOTSUPP; } static inline int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { return -EOPNOTSUPP; } static inline bool fscrypt_is_bounce_page(struct page *page) { return false; } static inline struct page *fscrypt_pagecache_page(struct page *bounce_page) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline bool fscrypt_is_bounce_folio(struct folio *folio) { return false; } static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); } static inline void fscrypt_free_bounce_page(struct page *bounce_page) { } /* policy.c */ static inline int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { return 0; } static inline int fscrypt_set_context(struct inode *inode, void *fs_data) { return -EOPNOTSUPP; } struct fscrypt_dummy_policy { }; static inline int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param, struct fscrypt_dummy_policy *dummy_policy) { return -EINVAL; } static inline bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1, const struct fscrypt_dummy_policy *p2) { return true; } static inline void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, struct super_block *sb) { } static inline bool fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy) { return false; } static inline void fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy) { } /* keyring.c */ static inline void fscrypt_destroy_keyring(struct super_block *sb) { } static inline int fscrypt_ioctl_add_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } static inline int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } /* keysetup.c */ static inline int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; return 0; } static inline void fscrypt_put_encryption_info(struct inode *inode) { return; } static inline void fscrypt_free_inode(struct inode *inode) { } static inline int fscrypt_drop_inode(struct inode *inode) { return 0; } /* fname.c */ static inline int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; memset(fname, 0, sizeof(*fname)); fname->usr_fname = iname; fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } static inline void fscrypt_free_filename(struct fscrypt_name *fname) { return; } static inline int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { return -EOPNOTSUPP; } static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { return; } static inline int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { return -EOPNOTSUPP; } static inline bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { /* Encryption support disabled; use standard comparison */ if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); } static inline u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { WARN_ON_ONCE(1); return 0; } static inline int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { return 1; } /* bio.c */ static inline bool fscrypt_decrypt_bio(struct bio *bio) { return true; } static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; } /* hooks.c */ static inline int fscrypt_file_open(struct inode *inode, struct file *filp) { if (IS_ENCRYPTED(inode)) return -EOPNOTSUPP; return 0; } static inline int __fscrypt_prepare_link(struct inode *inode, struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_readdir(struct inode *dir) { return -EOPNOTSUPP; } static inline int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } static inline int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { return 0; } static inline int fscrypt_prepare_symlink(struct inode *dir, const char *target, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(dir)) return -EOPNOTSUPP; disk_link->name = (unsigned char *)target; disk_link->len = len + 1; if (disk_link->len > max_len) return -ENAMETOOLONG; return 0; } static inline int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { return -EOPNOTSUPP; } static inline const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, unsigned int max_size, struct delayed_call *done) { return ERR_PTR(-EOPNOTSUPP); } static inline int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) { return -EOPNOTSUPP; } static inline void fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop) { } #endif /* !CONFIG_FS_ENCRYPTION */ /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh); bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return false; } static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } static inline void fscrypt_set_bio_crypt_ctx_bh( struct bio *bio, const struct buffer_head *first_bh, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { return true; } static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh) { return true; } static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); } static inline u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { return nr_blocks; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /** * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the block layer via blk-crypto rather * than in the filesystem layer. */ static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && __fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer * encryption * @inode: an inode. If encrypted, its key must be set up. * * Return: true if the inode requires file contents encryption and if the * encryption should be done in the filesystem layer rather than in the * block layer via blk-crypto. */ static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode) { return fscrypt_needs_contents_encryption(inode) && !__fscrypt_inode_uses_inline_crypto(inode); } /** * fscrypt_has_encryption_key() - check whether an inode has had its key set up * @inode: the inode to check * * Return: %true if the inode has had its encryption key set up, else %false. * * Usually this should be preceded by fscrypt_get_encryption_info() to try to * set up the key first. */ static inline bool fscrypt_has_encryption_key(const struct inode *inode) { return fscrypt_get_inode_info(inode) != NULL; } /** * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted * directory * @old_dentry: an existing dentry for the inode being linked * @dir: the target directory * @dentry: negative dentry for the target filename * * A new link can only be added to an encrypted directory if the directory's * encryption key is available --- since otherwise we'd have no way to encrypt * the filename. * * We also verify that the link will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if the directory's encryption key is missing, * -EXDEV if the link would result in an inconsistent encryption policy, or * another -errno code. */ static inline int fscrypt_prepare_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry); return 0; } /** * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted * directories * @old_dir: source directory * @old_dentry: dentry for source file * @new_dir: target directory * @new_dentry: dentry for target location (may be negative unless exchanging) * @flags: rename flags (we care at least about %RENAME_EXCHANGE) * * Prepare for ->rename() where the source and/or target directories may be * encrypted. A new link can only be added to an encrypted directory if the * directory's encryption key is available --- since otherwise we'd have no way * to encrypt the filename. A rename to an existing name, on the other hand, * *is* cryptographically possible without the key. However, we take the more * conservative approach and just forbid all no-key renames. * * We also verify that the rename will not violate the constraint that all files * in an encrypted directory tree use the same encryption policy. * * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the * rename would cause inconsistent encryption policies, or another -errno code. */ static inline int fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir)) return __fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, flags); return 0; } /** * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted * directory * @dir: directory being searched * @dentry: filename being looked up * @fname: (output) the name to use to search the on-disk directory * * Prepare for ->lookup() in a directory which may be encrypted by determining * the name that will actually be used to search the directory on-disk. If the * directory's encryption policy is supported by this kernel and its encryption * key is available, then the lookup is assumed to be by plaintext name; * otherwise, it is assumed to be by no-key name. * * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key * name. In this case the filesystem must assign the dentry a dentry_operations * which contains fscrypt_d_revalidate (or contains a d_revalidate method that * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the * directory's encryption key is later added. * * Return: 0 on success; -ENOENT if the directory's key is unavailable but the * filename isn't a valid no-key name, so a negative dentry should be created; * or another -errno code. */ static inline int fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, struct fscrypt_name *fname) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_lookup(dir, dentry, fname); memset(fname, 0, sizeof(*fname)); fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; fscrypt_prepare_dentry(dentry, false); return 0; } /** * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory * @dir: the directory inode * * If the directory is encrypted and it doesn't already have its encryption key * set up, try to set it up so that the filenames will be listed in plaintext * form rather than in no-key form. * * Return: 0 on success; -errno on error. Note that the encryption key being * unavailable is not considered an error. It is also not an error if * the encryption policy is unsupported by this kernel; that is treated * like the key being unavailable, so that files can still be deleted. */ static inline int fscrypt_prepare_readdir(struct inode *dir) { if (IS_ENCRYPTED(dir)) return __fscrypt_prepare_readdir(dir); return 0; } /** * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's * attributes * @dentry: dentry through which the inode is being changed * @attr: attributes to change * * Prepare for ->setattr() on a possibly-encrypted inode. On an encrypted file, * most attribute changes are allowed even without the encryption key. However, * without the encryption key we do have to forbid truncates. This is needed * because the size being truncated to may not be a multiple of the filesystem * block size, and in that case we'd have to decrypt the final block, zero the * portion past i_size, and re-encrypt it. (We *could* allow truncating to a * filesystem block boundary, but it's simpler to just forbid all truncates --- * and we already forbid all other contents modifications without the key.) * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (IS_ENCRYPTED(d_inode(dentry))) return __fscrypt_prepare_setattr(dentry, attr); return 0; } /** * fscrypt_encrypt_symlink() - encrypt the symlink target if needed * @inode: symlink inode * @target: plaintext symlink target * @len: length of @target excluding null terminator * @disk_link: (in/out) the on-disk symlink target being prepared * * If the symlink target needs to be encrypted, then this function encrypts it * into @disk_link->name. fscrypt_prepare_symlink() must have been called * previously to compute @disk_link->len. If the filesystem did not allocate a * buffer for @disk_link->name after calling fscrypt_prepare_link(), then one * will be kmalloc()'ed and the filesystem will be responsible for freeing it. * * Return: 0 on success, -errno on failure */ static inline int fscrypt_encrypt_symlink(struct inode *inode, const char *target, unsigned int len, struct fscrypt_str *disk_link) { if (IS_ENCRYPTED(inode)) return __fscrypt_encrypt_symlink(inode, target, len, disk_link); return 0; } /* If *pagep is a bounce page, free it and set *pagep to the pagecache page */ static inline void fscrypt_finalize_bounce_page(struct page **pagep) { struct page *page = *pagep; if (fscrypt_is_bounce_page(page)) { *pagep = fscrypt_pagecache_page(page); fscrypt_free_bounce_page(page); } } #endif /* _LINUX_FSCRYPT_H */ |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* user-type.h: User-defined key type * * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _KEYS_USER_TYPE_H #define _KEYS_USER_TYPE_H #include <linux/key.h> #include <linux/rcupdate.h> #ifdef CONFIG_KEYS /*****************************************************************************/ /* * the payload for a key of type "user" or "logon" * - once filled in and attached to a key: * - the payload struct is invariant may not be changed, only replaced * - the payload must be read with RCU procedures or with the key semaphore * held * - the payload may only be replaced with the key semaphore write-locked * - the key's data length is the size of the actual data, not including the * payload wrapper */ struct user_key_payload { struct rcu_head rcu; /* RCU destructor */ unsigned short datalen; /* length of this data */ char data[] __aligned(__alignof__(u64)); /* actual data */ }; extern struct key_type key_type_user; extern struct key_type key_type_logon; struct key_preparsed_payload; extern int user_preparse(struct key_preparsed_payload *prep); extern void user_free_preparse(struct key_preparsed_payload *prep); extern int user_update(struct key *key, struct key_preparsed_payload *prep); extern void user_revoke(struct key *key); extern void user_destroy(struct key *key); extern void user_describe(const struct key *user, struct seq_file *m); extern long user_read(const struct key *key, char *buffer, size_t buflen); static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key) { return (struct user_key_payload *)dereference_key_rcu(key); } static inline struct user_key_payload *user_key_payload_locked(const struct key *key) { return (struct user_key_payload *)dereference_key_locked((struct key *)key); } #endif /* CONFIG_KEYS */ #endif /* _KEYS_USER_TYPE_H */ |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:net,iface type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 nomatch flag support added */ /* 2 /0 support added */ /* 3 Counters support added */ /* 4 Comments support added */ /* 5 Forceadd support added */ /* 6 skbinfo support added */ /* 7 interface wildcard support added */ #define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:net,iface", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:net,iface"); /* Type specific function prefix */ #define HTYPE hash_netiface #define IP_SET_HASH_WITH_NETS #define IP_SET_HASH_WITH_MULTI #define IP_SET_HASH_WITH_NET0 #define STRSCPY(a, b) strscpy(a, b, IFNAMSIZ) /* IPv4 variant */ struct hash_netiface4_elem_hashed { __be32 ip; u8 physdev; u8 cidr; u8 nomatch; u8 elem; }; /* Member elements */ struct hash_netiface4_elem { __be32 ip; u8 physdev; u8 cidr; u8 nomatch; u8 elem; u8 wildcard; char iface[IFNAMSIZ]; }; /* Common functions */ static bool hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1, const struct hash_netiface4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && (ip1->wildcard ? strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : strcmp(ip1->iface, ip2->iface) == 0); } static int hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static void hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr) { elem->ip &= ip_set_netmask(cidr); elem->cidr = cidr; } static bool hash_netiface4_data_list(struct sk_buff *skb, const struct hash_netiface4_elem *data) { u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netiface4_data_next(struct hash_netiface4_elem *next, const struct hash_netiface4_elem *d) { next->ip = d->ip; } #define MTYPE hash_netiface4 #define HOST_MASK 32 #define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed) #include "ip_set_hash_gen.h" #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static const char *get_physindev_name(const struct sk_buff *skb, struct net *net) { struct net_device *dev = nf_bridge_get_physindev(skb, net); return dev ? dev->name : NULL; } static const char *get_physoutdev_name(const struct sk_buff *skb) { struct net_device *dev = nf_bridge_get_physoutdev(skb); return dev ? dev->name : NULL; } #endif static int hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct hash_netiface4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); e.ip &= ip_set_netmask(e.cidr); #define IFACE(dir) (par->state->dir ? par->state->dir->name : "") #define SRCDIR (opt->flags & IPSET_DIM_TWO_SRC) if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) return -EINVAL; STRSCPY(e.iface, eiface); e.physdev = 1; #endif } else { STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); } if (strlen(e.iface) == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_netiface4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) e.wildcard = 1; } if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) { e.ip = htonl(ip & ip_set_hostmask(e.cidr)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip_to < ip) swap(ip, ip_to); if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip, ip_to, e.cidr); } if (retried) ip = ntohl(h->next.ip); do { i++; e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_netiface4_data_next(&h->next, &e); return -ERANGE; } ip = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip++ < ip_to); return ret; } /* IPv6 variant */ struct hash_netiface6_elem_hashed { union nf_inet_addr ip; u8 physdev; u8 cidr; u8 nomatch; u8 elem; }; struct hash_netiface6_elem { union nf_inet_addr ip; u8 physdev; u8 cidr; u8 nomatch; u8 elem; u8 wildcard; char iface[IFNAMSIZ]; }; /* Common functions */ static bool hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1, const struct hash_netiface6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->cidr == ip2->cidr && (++*multi) && ip1->physdev == ip2->physdev && (ip1->wildcard ? strncmp(ip1->iface, ip2->iface, strlen(ip1->iface)) == 0 : strcmp(ip1->iface, ip2->iface) == 0); } static int hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags) { elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH; } static void hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netiface6_data_netmask(struct hash_netiface6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip, cidr); elem->cidr = cidr; } static bool hash_netiface6_data_list(struct sk_buff *skb, const struct hash_netiface6_elem *data) { u32 flags = (data->physdev ? IPSET_FLAG_PHYSDEV : 0) | (data->wildcard ? IPSET_FLAG_IFACE_WILDCARD : 0); if (data->nomatch) flags |= IPSET_FLAG_NOMATCH; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) || nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netiface6_data_next(struct hash_netiface6_elem *next, const struct hash_netiface6_elem *d) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_netiface6 #define HOST_MASK 128 #define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed) #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct hash_netiface6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), .elem = 1, }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6_netmask(&e.ip, e.cidr); if (opt->cmdflags & IPSET_FLAG_PHYSDEV) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) : get_physoutdev_name(skb); if (!eiface) return -EINVAL; STRSCPY(e.iface, eiface); e.physdev = 1; #endif } else { STRSCPY(e.iface, SRCDIR ? IFACE(in) : IFACE(out)); } if (strlen(e.iface) == 0) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IFACE] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ip6_netmask(&e.ip, e.cidr); nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_PHYSDEV) e.physdev = 1; if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); if (cadt_flags & IPSET_FLAG_IFACE_WILDCARD) e.wildcard = 1; } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } static struct ip_set_type hash_netiface_type __read_mostly = { .name = "hash:net,iface", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_IFACE | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netiface_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IFACE] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_netiface_init(void) { return ip_set_type_register(&hash_netiface_type); } static void __exit hash_netiface_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_netiface_type); } module_init(hash_netiface_init); module_exit(hash_netiface_fini); |
145 3 3 3 3 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 | // SPDX-License-Identifier: GPL-2.0-only /* * mm/readahead.c - address_space-level file readahead. * * Copyright (C) 2002, Linus Torvalds * * 09Apr2002 Andrew Morton * Initial version. */ /** * DOC: Readahead Overview * * Readahead is used to read content into the page cache before it is * explicitly requested by the application. Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in * the page cache, or that it is in the page cache and has the * readahead flag set. This flag indicates that the folio was read * as part of a previous readahead request and now that it has been * accessed, it is time for the next readahead. * * Each readahead request is partly synchronous read, and partly async * readahead. This is reflected in the struct file_ra_state which * contains ->size being the total number of pages, and ->async_size * which is the number of pages in the async section. The readahead * flag will be set on the first folio in this async section to trigger * a subsequent readahead. Once a series of sequential reads has been * established, there should be no need for a synchronous component and * all readahead request will be fully asynchronous. * * When either of the triggers causes a readahead, three numbers need * to be determined: the start of the region to read, the size of the * region, and the size of the async tail. * * The start of the region is simply the first page address at or after * the accessed address, which is not currently populated in the page * cache. This is found with a simple search in the page cache. * * The size of the async tail is determined by subtracting the size that * was explicitly requested from the determined request size, unless * this would be less than zero - then zero is used. NOTE THIS * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. * * The size of the region is normally determined from the size of the * previous readahead which loaded the preceding pages. This may be * discovered from the struct file_ra_state for simple sequential reads, * or from examining the state of the page cache when multiple * sequential reads are interleaved. Specifically: where the readahead * was triggered by the readahead flag, the size of the previous * readahead is assumed to be the number of pages from the triggering * page to the start of the new readahead. In these cases, the size of * the previous readahead is scaled, often doubled, for the new * readahead, though see get_next_ra_size() for details. * * If the size of the previous read cannot be determined, the number of * preceding pages in the page cache is used to estimate the size of * a previous read. This estimate could easily be misled by random * reads being coincidentally adjacent, so it is ignored unless it is * larger than the current request, and it is not scaled up, unless it * is at the start of file. * * In general readahead is accelerated at the start of the file, as * reads from there are often sequential. There are other minor * adjustments to the readahead size in various special cases and these * are best discovered by reading the code. * * The above calculation, based on the previous readahead size, * determines the size of the readahead, to which any requested read * size may be added. * * Readahead requests are sent to the filesystem using the ->readahead() * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * * ->readahead() will generally call readahead_folio() repeatedly to get * each folio from those prepared for readahead. It may fail to read a * folio by: * * * not calling readahead_folio() sufficiently many times, effectively * ignoring some folios, as might be appropriate if the path to * storage is congested. * * * failing to actually submit a read request for a given folio, * possibly due to insufficient resources, or * * * getting an error during subsequent processing of a request. * * In the last two cases, the folio should be unlocked by the filesystem * to indicate that the read attempt has failed. In the first case the * folio will be unlocked by the VFS. * * Those folios not in the final ``async_size`` of the request should be * considered to be important and ->readahead() should not fail them due * to congestion or temporary resource unavailability, but should wait * for necessary resources (e.g. memory or indexing information) to * become available. Folios in the final ``async_size`` may be * considered less urgent and failure to read them is more acceptable. * In this case it is best to use filemap_remove_folio() to remove the * folios from the page cache as is automatically done for folios that * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using * ->read_folio() which may be less efficient. */ #include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/export.h> #include <linux/backing-dev.h> #include <linux/task_io_accounting_ops.h> #include <linux/pagemap.h> #include <linux/psi.h> #include <linux/syscalls.h> #include <linux/file.h> #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> #include <linux/sched/mm.h> #include "internal.h" /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. */ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) return; if (unlikely(rac->_workingset)) psi_memstall_enter(&rac->_pflags); blk_start_plug(&plug); if (aops->readahead) { aops->readahead(rac); /* * Clean up the remaining folios. The sizes in ->ra * may be used to size the next readahead, so make sure * they accurately reflect what happened. */ while ((folio = readahead_folio(rac)) != NULL) { unsigned long nr = folio_nr_pages(folio); folio_get(folio); rac->ra->size -= nr; if (rac->ra->async_size >= nr) { rac->ra->async_size -= nr; filemap_remove_folio(folio); } folio_unlock(folio); folio_put(folio); } } else { while ((folio = readahead_folio(rac)) != NULL) aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); if (unlikely(rac->_workingset)) psi_memstall_leave(&rac->_pflags); rac->_workingset = false; BUG_ON(readahead_count(rac)); } /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. * @nr_to_read: The number of pages to read. * @lookahead_size: Where to start the next readahead. * * This function is for filesystems to call when they want to start * readahead beyond a file's stated i_size. This is almost certainly * not the function you want to call. Use page_cache_async_readahead() * or page_cache_sync_readahead() instead. * * Context: File is referenced by caller. Mutexes may be held by caller. * May sleep, but will not reenter filesystem to reclaim memory. */ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long mark = ULONG_MAX, i = 0; unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); /* * Partway through the readahead operation, we will have added * locked pages to the page cache, but will not yet have submitted * them for I/O. Adding another page may need to allocate memory, * which can trigger memory reclaim. Telling the VM we're in * the middle of a filesystem operation will cause it to not * touch file-backed pages, preventing a deadlock. Most (all?) * filesystems already specify __GFP_NOFS in their mapping's * gfp_mask, but let's be explicit here. */ unsigned int nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); index = mapping_align_index(mapping, index); /* * As iterator `i` is aligned to min_nrpages, round_up the * difference between nr_to_read and lookahead_size to mark the * index that only has lookahead or "async_region" to set the * readahead flag. */ if (lookahead_size <= nr_to_read) { unsigned long ra_folio_index; ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, min_nrpages); mark = ra_folio_index - index; } nr_to_read += readahead_index(ractl) - index; ractl->_index = index; /* * Preallocate as many pages as we will need. */ while (i < nr_to_read) { struct folio *folio = xa_load(&mapping->i_pages, index + i); int ret; if (folio && !xa_is_value(folio)) { /* * Page already present? Kick off the current batch * of contiguous pages before continuing with the * next batch. This page may be the one we would * have intended to mark as Readahead, but we don't * have a stable reference to this page, and it's * not worth getting one just for that. */ read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } folio = filemap_alloc_folio(gfp_mask, mapping_min_folio_order(mapping)); if (!folio) break; ret = filemap_add_folio(mapping, folio, index + i, gfp_mask); if (ret < 0) { folio_put(folio); if (ret == -ENOMEM) break; read_pages(ractl); ractl->_index += min_nrpages; i = ractl->_index + ractl->_nr_pages - index; continue; } if (i == mark) folio_set_readahead(folio); ractl->_workingset |= folio_test_workingset(folio); ractl->_nr_pages += min_nrpages; i += min_nrpages; } /* * Now start the IO. We ignore I/O errors - if the folio is not * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); /* * do_page_cache_ra() actually reads a chunk of disk. It allocates * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. */ static void do_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = ractl->mapping->host; unsigned long index = readahead_index(ractl); loff_t isize = i_size_read(inode); pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; end_index = (isize - 1) >> PAGE_SHIFT; if (index > end_index) return; /* Don't read past the page containing the last byte of the file */ if (nr_to_read > end_index - index) nr_to_read = end_index - index + 1; page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. */ void force_page_cache_ra(struct readahead_control *ractl, unsigned long nr_to_read) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages; if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min_t(unsigned long, nr_to_read, max_pages); while (nr_to_read) { unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; do_page_cache_ra(ractl, this_chunk, 0); nr_to_read -= this_chunk; } } /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large * for 128k (32 page) max ra * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial */ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) { unsigned long newsize = roundup_pow_of_two(size); if (newsize <= max / 32) newsize = newsize * 4; else if (newsize <= max / 4) newsize = newsize * 2; else newsize = max; return newsize; } /* * Get the previous window size, ramp it up, and * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, unsigned long max) { unsigned long cur = ra->size; if (cur < max / 16) return 4 * cur; if (cur <= max / 2) return 2 * cur; return max; } /* * On-demand readahead design. * * The fields in struct file_ra_state represent the most-recently-executed * readahead attempt: * * |<----- async_size ---------| * |------------------- size -------------------->| * |==================#===========================| * ^start ^page marked with PG_readahead * * To overlap application thinking time and disk I/O time, we do * `readahead pipelining': Do not wait until the application consumed all * readahead pages and stalled on the missing page at readahead_index; * Instead, submit an asynchronous readahead I/O as soon as there are * only async_size pages left in the readahead window. Normally async_size * will be equal to size, for maximum pipelining. * * In interleaved sequential reads, concurrent streams on the same fd can * be invalidating each other's readahead state. So we flag the new readahead * page at (start+size-async_size) with PG_readahead, and use it as readahead * indicator. The flag won't be set on already cached pages, to avoid the * readahead-for-nothing fuss, saving pointless page cache lookups. * * prev_pos tracks the last visited byte in the _previous_ read request. * It should be maintained by the caller, and will be used for detecting * small random reads. Note that the readahead algorithm checks loosely * for sequential patterns. Hence interleaved reads might be served as * sequential ones. * * There is a special-case: if the first page which the application tries to * read happens to be the first page of the file, it is assumed that a linear * read is about to happen and the window is immediately set to the initial size * based on I/O request size and the max_readahead. * * The code ramps up the readahead size aggressively at first, but slow down as * it approaches max_readhead. */ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; struct folio *folio = filemap_alloc_folio(gfp, order); if (!folio) return -ENOMEM; mark = round_down(mark, 1UL << order); if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) { folio_put(folio); return err; } ractl->_nr_pages += 1UL << order; ractl->_workingset |= folio_test_workingset(folio); return 0; } void page_cache_ra_order(struct readahead_control *ractl, struct file_ra_state *ra, unsigned int new_order) { struct address_space *mapping = ractl->mapping; pgoff_t index = readahead_index(ractl); unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; pgoff_t mark = index + ra->size - ra->async_size; unsigned int nofs; int err = 0; gfp_t gfp = readahead_gfp_mask(mapping); unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping)); /* * Fallback when size < min_nrpages as each folio should be * at least min_nrpages anyway. */ if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size) goto fallback; limit = min(limit, index + ra->size - 1); if (new_order < mapping_max_folio_order(mapping)) new_order += 2; new_order = min(mapping_max_folio_order(mapping), new_order); new_order = min_t(unsigned int, new_order, ilog2(ra->size)); new_order = max(new_order, min_order); /* See comment in page_cache_ra_unbounded() */ nofs = memalloc_nofs_save(); filemap_invalidate_lock_shared(mapping); /* * If the new_order is greater than min_order and index is * already aligned to new_order, then this will be noop as index * aligned to new_order should also be aligned to min_order. */ ractl->_index = mapping_align_index(mapping, index); index = readahead_index(ractl); while (index <= limit) { unsigned int order = new_order; /* Align with smaller pages if needed */ if (index & ((1UL << order) - 1)) order = __ffs(index); /* Don't allocate pages past EOF */ while (order > min_order && index + (1UL << order) - 1 > limit) order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; index += 1UL << order; } read_pages(ractl); filemap_invalidate_unlock_shared(mapping); memalloc_nofs_restore(nofs); /* * If there were already pages in the page cache, then we may have * left some gaps. Let the regular readahead code take care of this * situation. */ if (!err) return; fallback: do_page_cache_ra(ractl, ra->size, ra->async_size); } static unsigned long ractl_max_pages(struct readahead_control *ractl, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); unsigned long max_pages = ractl->ra->ra_pages; /* * If the request exceeds the readahead window, allow the read to * be up to the optimal hardware IO size */ if (req_size > max_pages && bdi->io_pages > max_pages) max_pages = min(req_size, bdi->io_pages); return max_pages; } void page_cache_sync_ra(struct readahead_control *ractl, unsigned long req_count) { pgoff_t index = readahead_index(ractl); bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); struct file_ra_state *ra = ractl->ra; unsigned long max_pages, contig_count; pgoff_t prev_index, miss; /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced * readahead will do the right thing and limit the read to just the * requested range, which we'll set to 1 page for this case. */ if (!ra->ra_pages || blk_cgroup_congested()) { if (!ractl->file) return; req_count = 1; do_forced_ra = true; } /* be dumb */ if (do_forced_ra) { force_page_cache_ra(ractl, req_count); return; } max_pages = ractl_max_pages(ractl, req_count); prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; /* * A start of file, oversized read, or sequential cache miss: * trivial case: (index - prev_index) == 1 * unaligned reads: (index - prev_index) == 0 */ if (!index || req_count > max_pages || index - prev_index <= 1UL) { ra->start = index; ra->size = get_init_ra_size(req_count, max_pages); ra->async_size = ra->size > req_count ? ra->size - req_count : ra->size >> 1; goto readit; } /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ rcu_read_lock(); miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages); rcu_read_unlock(); contig_count = index - miss - 1; /* * Standalone, small random read. Read as is, and do not pollute the * readahead state. */ if (contig_count <= req_count) { do_page_cache_ra(ractl, req_count, 0); return; } /* * File cached from the beginning: * it is a strong indication of long-run stream (or whole-file-read) */ if (miss == ULONG_MAX) contig_count *= 2; ra->start = index; ra->size = min(contig_count + req_count, max_pages); ra->async_size = 1; readit: ractl->_index = ra->start; page_cache_ra_order(ractl, ra, 0); } EXPORT_SYMBOL_GPL(page_cache_sync_ra); void page_cache_async_ra(struct readahead_control *ractl, struct folio *folio, unsigned long req_count) { unsigned long max_pages; struct file_ra_state *ra = ractl->ra; pgoff_t index = readahead_index(ractl); pgoff_t expected, start; unsigned int order = folio_order(folio); /* no readahead */ if (!ra->ra_pages) return; /* * Same bit is used for PG_readahead and PG_reclaim. */ if (folio_test_writeback(folio)) return; folio_clear_readahead(folio); if (blk_cgroup_congested()) return; max_pages = ractl_max_pages(ractl, req_count); /* * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ expected = round_down(ra->start + ra->size - ra->async_size, 1UL << order); if (index == expected) { ra->start += ra->size; /* * In the case of MADV_HUGEPAGE, the actual size might exceed * the readahead window. */ ra->size = max(ra->size, get_next_ra_size(ra, max_pages)); ra->async_size = ra->size; goto readit; } /* * Hit a marked folio without valid readahead state. * E.g. interleaved reads. * Query the pagecache for async_size, which normally equals to * readahead size. Ramp it up and use it as the new readahead size. */ rcu_read_lock(); start = page_cache_next_miss(ractl->mapping, index + 1, max_pages); rcu_read_unlock(); if (!start || start - index > max_pages) return; ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_count; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; readit: ractl->_index = ra->start; page_cache_ra_order(ractl, ra, order); } EXPORT_SYMBOL_GPL(page_cache_async_ra); ssize_t ksys_readahead(int fd, loff_t offset, size_t count) { CLASS(fd, f)(fd); if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ)) return -EBADF; /* * The readahead() syscall is intended to run only on files * that can execute readahead. If readahead is not possible * on this file, then we must return -EINVAL. */ if (!fd_file(f)->f_mapping || !fd_file(f)->f_mapping->a_ops || (!S_ISREG(file_inode(fd_file(f))->i_mode) && !S_ISBLK(file_inode(fd_file(f))->i_mode))) return -EINVAL; return vfs_fadvise(fd_file(f), offset, count, POSIX_FADV_WILLNEED); } SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { return ksys_readahead(fd, offset, count); } #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) { return ksys_readahead(fd, compat_arg_u64_glue(offset), count); } #endif /** * readahead_expand - Expand a readahead request * @ractl: The request to be expanded * @new_start: The revised start * @new_len: The revised size of the request * * Attempt to expand a readahead request outwards from the current size to the * specified size by inserting locked pages before and after the current window * to increase the size to the new window. This may involve the insertion of * THPs, in which case the window may get expanded even beyond what was * requested. * * The algorithm will stop if it encounters a conflicting page already in the * pagecache and leave a smaller expansion than requested. * * The caller must check for this by examining the revised @ractl object for a * different expansion than was requested. */ void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len) { struct address_space *mapping = ractl->mapping; struct file_ra_state *ra = ractl->ra; pgoff_t new_index, new_nr_pages; gfp_t gfp_mask = readahead_gfp_mask(mapping); unsigned long min_nrpages = mapping_min_folio_nrpages(mapping); unsigned int min_order = mapping_min_folio_order(mapping); new_index = new_start / PAGE_SIZE; /* * Readahead code should have aligned the ractl->_index to * min_nrpages before calling readahead aops. */ VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages)); /* Expand the leading edge downwards */ while (ractl->_index > new_index) { unsigned long index = ractl->_index - 1; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = filemap_alloc_folio(gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; ractl->_index = folio->index; } new_len += new_start - readahead_pos(ractl); new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); /* Expand the trailing edge upwards */ while (ractl->_nr_pages < new_nr_pages) { unsigned long index = ractl->_index + ractl->_nr_pages; struct folio *folio = xa_load(&mapping->i_pages, index); if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ folio = filemap_alloc_folio(gfp_mask, min_order); if (!folio) return; index = mapping_align_index(mapping, index); if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { folio_put(folio); return; } if (unlikely(folio_test_workingset(folio)) && !ractl->_workingset) { ractl->_workingset = true; psi_memstall_enter(&ractl->_pflags); } ractl->_nr_pages += min_nrpages; if (ra) { ra->size += min_nrpages; ra->async_size += min_nrpages; } } } EXPORT_SYMBOL(readahead_expand); |
177 177 166 165 76 10 133 163 148 46 76 141 143 124 10 142 165 165 110 110 111 57 57 57 12 13 174 13 17 160 3 172 25 7 172 173 11 174 27 28 3 173 175 176 174 37 172 172 20 174 8 8 8 8 8 8 8 141 141 17 30 15 140 140 3 141 32 138 141 3 8 8 8 140 7 81 98 174 171 1 173 173 141 140 141 141 1 1 1 46 123 140 23 140 24 172 87 1 87 12 36 34 19 2 14 3 172 157 68 14 21 20 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions handle output processing. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> #include <linux/time.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/init.h> #include <linux/slab.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/icmp.h> #include <net/net_namespace.h> #include <linux/socket.h> /* for sa_family_t */ #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> /* Forward declarations for private helpers. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk); static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len); static void sctp_packet_reset(struct sctp_packet *packet) { /* sctp_packet_transmit() relies on this to reset size to the * current overhead after sending packets. */ packet->size = packet->overhead; packet->has_cookie_echo = 0; packet->has_sack = 0; packet->has_data = 0; packet->has_auth = 0; packet->ipfragok = 0; packet->auth = NULL; } /* Config a packet. * This appears to be a followup set of initializations. */ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_sock *sp = NULL; struct sock *sk; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; /* do the following jobs only once for a flush schedule */ if (!sctp_packet_empty(packet)) return; /* set packet max_size with pathmtu, then calculate overhead */ packet->max_size = tp->pathmtu; if (asoc) { sk = asoc->base.sk; sp = sctp_sk(sk); } packet->overhead = sctp_mtu_payload(sp, 0, 0); packet->size = packet->overhead; if (!asoc) return; /* update dst or transport pathmtu if in need */ if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sp); if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); } else if (!sctp_transport_pl_enabled(tp) && asoc->param_flags & SPP_PMTUD_ENABLE) { if (!sctp_transport_pmtu_check(tp)) sctp_assoc_sync_pmtu(asoc); } if (asoc->pmtu_pending) { if (asoc->param_flags & SPP_PMTUD_ENABLE) sctp_assoc_sync_pmtu(asoc); asoc->pmtu_pending = 0; } /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ if (ecn_capable) { struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } if (!tp->dst) return; /* set packet max_size with gso_max_size if gso is enabled*/ rcu_read_lock(); if (__sk_dst_get(sk) != tp->dst) { dst_hold(tp->dst); sk_setup_caps(sk, tp->dst); } packet->max_size = sk_can_gso(sk) ? min(READ_ONCE(tp->dst->dev->gso_max_size), GSO_LEGACY_MAX_SIZE) : asoc->pathmtu; rcu_read_unlock(); } /* Initialize the packet structure. */ void sctp_packet_init(struct sctp_packet *packet, struct sctp_transport *transport, __u16 sport, __u16 dport) { pr_debug("%s: packet:%p transport:%p\n", __func__, packet, transport); packet->transport = transport; packet->source_port = sport; packet->destination_port = dport; INIT_LIST_HEAD(&packet->chunk_list); /* The overhead will be calculated by sctp_packet_config() */ packet->overhead = 0; sctp_packet_reset(packet); packet->vtag = 0; } /* Free a packet. */ void sctp_packet_free(struct sctp_packet *packet) { struct sctp_chunk *chunk, *tmp; pr_debug("%s: packet:%p\n", __func__, packet); list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } } /* This routine tries to append the chunk to the offered packet. If adding * the chunk causes the packet to exceed the path MTU and COOKIE_ECHO chunk * is not present in the packet, it transmits the input packet. * Data can be bundled with a packet containing a COOKIE_ECHO chunk as long * as it can fit in the packet, but any more data that does not fit in this * packet can be sent only after receiving the COOKIE_ACK. */ enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, int one_packet, gfp_t gfp) { enum sctp_xmit retval; pr_debug("%s: packet:%p size:%zu chunk:%p size:%d\n", __func__, packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: if (!packet->has_cookie_echo) { int error = 0; error = sctp_packet_transmit(packet, gfp); if (error < 0) chunk->skb->sk->sk_err = -error; /* If we have an empty packet, then we can NOT ever * return PMTU_FULL. */ if (!one_packet) retval = sctp_packet_append_chunk(packet, chunk); } break; case SCTP_XMIT_RWND_FULL: case SCTP_XMIT_OK: case SCTP_XMIT_DELAY: break; } return retval; } /* Try to bundle a pad chunk into a packet with a heartbeat chunk for PLPMTUTD probe */ static enum sctp_xmit sctp_packet_bundle_pad(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_transport *t = pkt->transport; struct sctp_chunk *pad; int overhead = 0; if (!chunk->pmtu_probe) return SCTP_XMIT_OK; /* calculate the Padding Data size for the pad chunk */ overhead += sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); overhead += sizeof(struct sctp_sender_hb_info) + sizeof(struct sctp_pad_chunk); pad = sctp_make_pad(t->asoc, t->pl.probe_size - overhead); if (!pad) return SCTP_XMIT_DELAY; list_add_tail(&pad->list, &pkt->chunk_list); pkt->size += SCTP_PAD4(ntohs(pad->chunk_hdr->length)); chunk->transport = t; return SCTP_XMIT_OK; } /* Try to bundle an auth chunk into the packet. */ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt, struct sctp_chunk *chunk) { struct sctp_association *asoc = pkt->transport->asoc; enum sctp_xmit retval = SCTP_XMIT_OK; struct sctp_chunk *auth; /* if we don't have an association, we can't do authentication */ if (!asoc) return retval; /* See if this is an auth chunk we are bundling or if * auth is already bundled. */ if (chunk->chunk_hdr->type == SCTP_CID_AUTH || pkt->has_auth) return retval; /* if the peer did not request this chunk to be authenticated, * don't do it */ if (!chunk->auth) return retval; auth = sctp_make_auth(asoc, chunk->shkey->key_id); if (!auth) return retval; auth->shkey = chunk->shkey; sctp_auth_shkey_hold(auth->shkey); retval = __sctp_packet_append_chunk(pkt, auth); if (retval != SCTP_XMIT_OK) sctp_chunk_free(auth); return retval; } /* Try to bundle a SACK with the packet. */ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; /* If sending DATA and haven't aleady bundled a SACK, try to * bundle one in to the packet. */ if (sctp_chunk_is_data(chunk) && !pkt->has_sack && !pkt->has_cookie_echo) { struct sctp_association *asoc; struct timer_list *timer; asoc = pkt->transport->asoc; timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; /* If the SACK timer is running, we have a pending SACK */ if (timer_pending(timer)) { struct sctp_chunk *sack; if (pkt->transport->sack_generation != pkt->transport->asoc->peer.sack_generation) return retval; asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (sack) { retval = __sctp_packet_append_chunk(pkt, sack); if (retval != SCTP_XMIT_OK) { sctp_chunk_free(sack); goto out; } SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; if (del_timer(timer)) sctp_association_put(asoc); } } } out: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ static enum sctp_xmit __sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { __u16 chunk_len = SCTP_PAD4(ntohs(chunk->chunk_hdr->length)); enum sctp_xmit retval = SCTP_XMIT_OK; /* Check to see if this chunk will fit into the packet */ retval = sctp_packet_will_fit(packet, chunk, chunk_len); if (retval != SCTP_XMIT_OK) goto finish; /* We believe that this chunk is OK to add to the packet */ switch (chunk->chunk_hdr->type) { case SCTP_CID_DATA: case SCTP_CID_I_DATA: /* Account for the data being in the packet */ sctp_packet_append_data(packet, chunk); /* Disallow SACK bundling after DATA. */ packet->has_sack = 1; /* Disallow AUTH bundling after DATA */ packet->has_auth = 1; /* Let it be knows that packet has DATA in it */ packet->has_data = 1; /* timestamp the chunk for rtx purposes */ chunk->sent_at = jiffies; /* Mainly used for prsctp RTX policy */ chunk->sent_count++; break; case SCTP_CID_COOKIE_ECHO: packet->has_cookie_echo = 1; break; case SCTP_CID_SACK: packet->has_sack = 1; if (chunk->asoc) chunk->asoc->stats.osacks++; break; case SCTP_CID_AUTH: packet->has_auth = 1; packet->auth = chunk; break; } /* It is OK to send this chunk. */ list_add_tail(&chunk->list, &packet->chunk_list); packet->size += chunk_len; chunk->transport = packet->transport; finish: return retval; } /* Append a chunk to the offered packet reporting back any inability to do * so. */ enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk) { enum sctp_xmit retval = SCTP_XMIT_OK; pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); /* Data chunks are special. Before seeing what else we can * bundle into this packet, check to see if we are allowed to * send this DATA. */ if (sctp_chunk_is_data(chunk)) { retval = sctp_packet_can_append_data(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; } /* Try to bundle AUTH chunk */ retval = sctp_packet_bundle_auth(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; /* Try to bundle SACK chunk */ retval = sctp_packet_bundle_sack(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = __sctp_packet_append_chunk(packet, chunk); if (retval != SCTP_XMIT_OK) goto finish; retval = sctp_packet_bundle_pad(packet, chunk); finish: return retval; } static void sctp_packet_gso_append(struct sk_buff *head, struct sk_buff *skb) { if (SCTP_OUTPUT_CB(head)->last == head) skb_shinfo(head)->frag_list = skb; else SCTP_OUTPUT_CB(head)->last->next = skb; SCTP_OUTPUT_CB(head)->last = skb; head->truesize += skb->truesize; head->data_len += skb->len; head->len += skb->len; refcount_add(skb->truesize, &head->sk->sk_wmem_alloc); __skb_header_release(skb); } static int sctp_packet_pack(struct sctp_packet *packet, struct sk_buff *head, int gso, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_auth_chunk *auth = NULL; struct sctp_chunk *chunk, *tmp; int pkt_count = 0, pkt_size; struct sock *sk = head->sk; struct sk_buff *nskb; int auth_len = 0; if (gso) { skb_shinfo(head)->gso_type = sk->sk_gso_type; SCTP_OUTPUT_CB(head)->last = head; } else { nskb = head; pkt_size = packet->size; goto merge; } do { /* calculate the pkt_size and alloc nskb */ pkt_size = packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padded = SCTP_PAD4(chunk->skb->len); if (chunk == packet->auth) auth_len = padded; else if (auth_len + padded + packet->overhead > tp->pathmtu) return 0; else if (pkt_size + padded > tp->pathmtu) break; pkt_size += padded; } nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); if (!nskb) return 0; skb_reserve(nskb, packet->overhead + MAX_HEADER); merge: /* merge chunks into nskb and append nskb into head list */ pkt_size -= packet->overhead; list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { int padding; list_del_init(&chunk->list); if (sctp_chunk_is_data(chunk)) { if (!sctp_chunk_retransmitted(chunk) && !tp->rto_pending) { chunk->rtt_in_progress = 1; tp->rto_pending = 1; } } padding = SCTP_PAD4(chunk->skb->len) - chunk->skb->len; if (padding) skb_put_zero(chunk->skb, padding); if (chunk == packet->auth) auth = (struct sctp_auth_chunk *) skb_tail_pointer(nskb); skb_put_data(nskb, chunk->skb->data, chunk->skb->len); pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), chunk->has_tsn ? "TSN" : "No TSN", chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, ntohs(chunk->chunk_hdr->length), chunk->skb->len, chunk->rtt_in_progress); pkt_size -= SCTP_PAD4(chunk->skb->len); if (!sctp_chunk_is_data(chunk) && chunk != packet->auth) sctp_chunk_free(chunk); if (!pkt_size) break; } if (auth) { sctp_auth_calculate_hmac(tp->asoc, nskb, auth, packet->auth->shkey, gfp); /* free auth if no more chunks, or add it back */ if (list_empty(&packet->chunk_list)) sctp_chunk_free(packet->auth); else list_add(&packet->auth->list, &packet->chunk_list); } if (gso) sctp_packet_gso_append(head, nskb); pkt_count++; } while (!list_empty(&packet->chunk_list)); if (gso) { memset(head->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); skb_shinfo(head)->gso_segs = pkt_count; skb_shinfo(head)->gso_size = GSO_BY_FRAGS; goto chksum; } if (sctp_checksum_disable) return 1; if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) || dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) { struct sctphdr *sh = (struct sctphdr *)skb_transport_header(head); sh->checksum = sctp_compute_cksum(head, 0); } else { chksum: head->ip_summed = CHECKSUM_PARTIAL; head->csum_not_inet = 1; head->csum_start = skb_transport_header(head) - head->head; head->csum_offset = offsetof(struct sctphdr, checksum); } return pkt_count; } /* All packets are sent to the network through this function from * sctp_outq_tail(). * * The return value is always 0 for now. */ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; struct sk_buff *head; struct sctphdr *sh; struct sock *sk; pr_debug("%s: packet:%p\n", __func__, packet); if (list_empty(&packet->chunk_list)) return 0; chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { if (tp->pl.state == SCTP_PL_ERROR) { /* do IP fragmentation if in Error state */ packet->ipfragok = 1; } else { if (!sk_can_gso(sk)) { /* check gso */ pr_err_once("Trying to GSO but underlying device doesn't support it."); goto out; } gso = 1; } } /* alloc head skb */ head = alloc_skb((gso ? packet->overhead : packet->size) + MAX_HEADER, gfp); if (!head) goto out; skb_reserve(head, packet->overhead + MAX_HEADER); skb_set_owner_w(head, sk); /* set sctp header */ sh = skb_push(head, sizeof(struct sctphdr)); skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); sh->vtag = htonl(packet->vtag); sh->checksum = 0; /* drop packet if no dst */ if (!tp->dst) { IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(head); goto out; } /* pack up chunks */ pkt_count = sctp_packet_pack(packet, head, gso, gfp); if (!pkt_count) { kfree_skb(head); goto out; } pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); /* start autoclose timer */ if (packet->has_data && sctp_state(asoc, ESTABLISHED) && asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]) { struct timer_list *timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; unsigned long timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; if (!mod_timer(timer, jiffies + timeout)) sctp_association_hold(asoc); } /* sctp xmit */ tp->af_specific->ecn_capable(sk); if (asoc) { asoc->stats.opackets += pkt_count; if (asoc->peer.last_sent_to != tp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; if (tp->dst_pending_confirm) skb_set_dst_pending_confirm(head, 1); /* neighbour should be confirmed on successful transmission or * positive error */ if (tp->af_specific->sctp_xmit(head, tp) >= 0 && tp->dst_pending_confirm) tp->dst_pending_confirm = 0; out: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { list_del_init(&chunk->list); if (!sctp_chunk_is_data(chunk)) sctp_chunk_free(chunk); } sctp_packet_reset(packet); return 0; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This private function check to see if a chunk can be added */ static enum sctp_xmit sctp_packet_can_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { size_t datasize, rwnd, inflight, flight_size; struct sctp_transport *transport = packet->transport; struct sctp_association *asoc = transport->asoc; struct sctp_outq *q = &asoc->outqueue; /* RFC 2960 6.1 Transmission of DATA Chunks * * A) At any given time, the data sender MUST NOT transmit new data to * any destination transport address if its peer's rwnd indicates * that the peer has no buffer space (i.e. rwnd is 0, see Section * 6.2.1). However, regardless of the value of rwnd (including if it * is 0), the data sender can always have one DATA chunk in flight to * the receiver if allowed by cwnd (see rule B below). This rule * allows the sender to probe for a change in rwnd that the sender * missed due to the SACK having been lost in transit from the data * receiver to the data sender. */ rwnd = asoc->peer.rwnd; inflight = q->outstanding_bytes; flight_size = transport->flight_size; datasize = sctp_data_size(chunk); if (datasize > rwnd && inflight > 0) /* We have (at least) one data chunk in flight, * so we can't fall back to rule 6.1 B). */ return SCTP_XMIT_RWND_FULL; /* RFC 2960 6.1 Transmission of DATA Chunks * * B) At any given time, the sender MUST NOT transmit new data * to a given transport address if it has cwnd or more bytes * of data outstanding to that transport address. */ /* RFC 7.2.4 & the Implementers Guide 2.8. * * 3) ... * When a Fast Retransmit is being performed the sender SHOULD * ignore the value of cwnd and SHOULD NOT delay retransmission. */ if (chunk->fast_retransmit != SCTP_NEED_FRTX && flight_size >= transport->cwnd) return SCTP_XMIT_RWND_FULL; /* Nagle's algorithm to solve small-packet problem: * Inhibit the sending of new chunks when new outgoing data arrives * if any previously transmitted data on the connection remains * unacknowledged. */ if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) && !asoc->force_delay) /* Nothing unacked */ return SCTP_XMIT_OK; if (!sctp_packet_empty(packet)) /* Append to packet */ return SCTP_XMIT_OK; if (!sctp_state(asoc, ESTABLISHED)) return SCTP_XMIT_OK; /* Check whether this chunk and all the rest of pending data will fit * or delay in hopes of bundling a full sized packet. */ if (chunk->skb->len + q->out_qlen > transport->pathmtu - packet->overhead - sctp_datachk_len(&chunk->asoc->stream) - 4) /* Enough data queued to fill a packet */ return SCTP_XMIT_OK; /* Don't delay large message writes that may have been fragmented */ if (!chunk->msg->can_delay) return SCTP_XMIT_OK; /* Defer until all data acked or packet full */ return SCTP_XMIT_DELAY; } /* This private function does management things when adding DATA chunk */ static void sctp_packet_append_data(struct sctp_packet *packet, struct sctp_chunk *chunk) { struct sctp_transport *transport = packet->transport; size_t datasize = sctp_data_size(chunk); struct sctp_association *asoc = transport->asoc; u32 rwnd = asoc->peer.rwnd; /* Keep track of how many bytes are in flight over this transport. */ transport->flight_size += datasize; /* Keep track of how many bytes are in flight to the receiver. */ asoc->outqueue.outstanding_bytes += datasize; /* Update our view of the receiver's rwnd. */ if (datasize < rwnd) rwnd -= datasize; else rwnd = 0; asoc->peer.rwnd = rwnd; sctp_chunk_assign_tsn(chunk); asoc->stream.si->assign_number(chunk); } static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { enum sctp_xmit retval = SCTP_XMIT_OK; size_t psize, pmtu, maxsize; /* Don't bundle in this packet if this chunk's auth key doesn't * match other chunks already enqueued on this packet. Also, * don't bundle the chunk with auth key if other chunks in this * packet don't have auth key. */ if ((packet->auth && chunk->shkey != packet->auth->shkey) || (!packet->auth && chunk->shkey && chunk->chunk_hdr->type != SCTP_CID_AUTH)) return SCTP_XMIT_PMTU_FULL; psize = packet->size; if (packet->transport->asoc) pmtu = packet->transport->asoc->pathmtu; else pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ if (psize + chunk_len > pmtu) { /* It's OK to fragment at IP level if any one of the following * is true: * 1. The packet is empty (meaning this chunk is greater * the MTU) * 2. The packet doesn't have any data in it yet and data * requires authentication. */ if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; goto out; } /* Similarly, if this chunk was built before a PMTU * reduction, we have to fragment it at IP level now. So * if the packet already contains something, we need to * flush. */ maxsize = pmtu - packet->overhead; if (packet->auth) maxsize -= SCTP_PAD4(packet->auth->skb->len); if (chunk_len > maxsize) retval = SCTP_XMIT_PMTU_FULL; /* It is also okay to fragment if the chunk we are * adding is a control chunk, but only if current packet * is not a GSO one otherwise it causes fragmentation of * a large frame. So in this case we allow the * fragmentation by forcing it to be in a new packet. */ if (!sctp_chunk_is_data(chunk) && packet->has_data) retval = SCTP_XMIT_PMTU_FULL; if (psize + chunk_len > packet->max_size) /* Hit GSO/PMTU limit, gotta flush */ retval = SCTP_XMIT_PMTU_FULL; if (!packet->transport->burst_limited && psize + chunk_len > (packet->transport->cwnd >> 1)) /* Do not allow a single GSO packet to use more * than half of cwnd. */ retval = SCTP_XMIT_PMTU_FULL; if (packet->transport->burst_limited && psize + chunk_len > (packet->transport->burst_limited >> 1)) /* Do not allow a single GSO packet to use more * than half of original cwnd. */ retval = SCTP_XMIT_PMTU_FULL; /* Otherwise it will fit in the GSO packet */ } out: return retval; } |
82 82 82 1 82 82 81 82 81 81 10 9 9 10 5 9 8 8 1 3 9 1 5 5 3 3 2 2 2 20 32 30 32 9 19 5 37 1 34 34 1 32 32 2 1 9 11 6 8 6 3 1 1 2 1 1 1 2 1 1 1 1 1 3 1 1 42 1 1 1 1 34 3 2 1 1 1 1 1 47 43 4 14 4 5 2 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 | // SPDX-License-Identifier: GPL-2.0-only /* * Packet matching code for ARP packets. * * Based heavily, if not almost entirely, upon ip_tables.c framework. * * Some ARP specific bits are: * * Copyright (C) 2002 David S. Miller (davem@redhat.com) * Copyright (C) 2006-2009 Patrick McHardy <kaber@trash.net> * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/capability.h> #include <linux/if_arp.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/err.h> #include <net/compat.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> #include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); MODULE_DESCRIPTION("arptables core"); void *arpt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(arpt, ARPT); } EXPORT_SYMBOL_GPL(arpt_alloc_initial_table); static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, const char *hdr_addr, int len) { int i, ret; if (len > ARPT_DEV_ADDR_LEN_MAX) len = ARPT_DEV_ADDR_LEN_MAX; ret = 0; for (i = 0; i < len; i++) ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; return ret != 0; } /* * Unfortunately, _b and _mask are not aligned to an int (or long int) * Some arches dont care, unrolling the loop is a win on them. * For other arches, we only have a 16bit alignement. */ static unsigned long ifname_compare(const char *_a, const char *_b, const char *_mask) { #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS unsigned long ret = ifname_compare_aligned(_a, _b, _mask); #else unsigned long ret = 0; const u16 *a = (const u16 *)_a; const u16 *b = (const u16 *)_b; const u16 *mask = (const u16 *)_mask; int i; for (i = 0; i < IFNAMSIZ/sizeof(u16); i++) ret |= (a[i] ^ b[i]) & mask[i]; #endif return ret; } /* Returns whether packet matches rule or not. */ static inline int arp_packet_match(const struct arphdr *arphdr, struct net_device *dev, const char *indev, const char *outdev, const struct arpt_arp *arpinfo) { const char *arpptr = (char *)(arphdr + 1); const char *src_devaddr, *tgt_devaddr; __be32 src_ipaddr, tgt_ipaddr; long ret; if (NF_INVF(arpinfo, ARPT_INV_ARPOP, (arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop)) return 0; if (NF_INVF(arpinfo, ARPT_INV_ARPHRD, (arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd)) return 0; if (NF_INVF(arpinfo, ARPT_INV_ARPPRO, (arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro)) return 0; if (NF_INVF(arpinfo, ARPT_INV_ARPHLN, (arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln)) return 0; src_devaddr = arpptr; arpptr += dev->addr_len; memcpy(&src_ipaddr, arpptr, sizeof(u32)); arpptr += sizeof(u32); tgt_devaddr = arpptr; arpptr += dev->addr_len; memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); if (NF_INVF(arpinfo, ARPT_INV_SRCDEVADDR, arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len)) || NF_INVF(arpinfo, ARPT_INV_TGTDEVADDR, arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len))) return 0; if (NF_INVF(arpinfo, ARPT_INV_SRCIP, (src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr) || NF_INVF(arpinfo, ARPT_INV_TGTIP, (tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr)) return 0; /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); if (NF_INVF(arpinfo, ARPT_INV_VIA_IN, ret != 0)) return 0; ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); if (NF_INVF(arpinfo, ARPT_INV_VIA_OUT, ret != 0)) return 0; return 1; } static inline int arp_checkentry(const struct arpt_arp *arp) { if (arp->flags & ~ARPT_F_MASK) return 0; if (arp->invflags & ~ARPT_INV_MASK) return 0; return 1; } static unsigned int arpt_error(struct sk_buff *skb, const struct xt_action_param *par) { net_err_ratelimited("arp_tables: error: '%s'\n", (const char *)par->targinfo); return NF_DROP; } static inline const struct xt_entry_target * arpt_get_target_c(const struct arpt_entry *e) { return arpt_get_target((struct arpt_entry *)e); } static inline struct arpt_entry * get_entry(const void *base, unsigned int offset) { return (struct arpt_entry *)(base + offset); } static inline struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) { return (void *)entry + entry->next_offset; } unsigned int arpt_do_table(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; const struct arphdr *arp; struct arpt_entry *e, **jumpstack; const char *indev, *outdev; const void *table_base; unsigned int cpu, stackidx = 0; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; if (!pskb_may_pull(skb, arp_hdr_len(skb->dev))) return NF_DROP; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; local_bh_disable(); addend = xt_write_recseq_begin(); private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; /* No TEE support for arptables, so no need to switch to alternate * stack. All targets that reenter must return absolute verdicts. */ e = get_entry(table_base, private->hook_entry[hook]); acpar.state = state; acpar.hotdrop = false; arp = arp_hdr(skb); do { const struct xt_entry_target *t; struct xt_counters *counter; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); } else { e = jumpstack[--stackidx]; e = arpt_next_entry(e); } continue; } if (table_base + v != arpt_next_entry(e)) { if (unlikely(stackidx >= private->stacksize)) { verdict = NF_DROP; break; } jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); if (verdict == XT_CONTINUE) { /* Target might have changed stuff. */ arp = arp_hdr(skb); e = arpt_next_entry(e); } else { /* Verdict */ break; } } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); if (acpar.hotdrop) return NF_DROP; else return verdict; } /* All zeroes == unconditional rule. */ static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; return e->target_offset == sizeof(struct arpt_entry) && memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0, unsigned int *offsets) { unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset * to 0 as we leave), and comefrom to save source hook bitmask. */ for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; /* Set initial back pointer. */ e->counters.pcnt = pos; for (;;) { const struct xt_standard_target *t = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) return 0; e->comefrom |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. */ if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && t->verdict < 0) || visited) { unsigned int oldpos, size; /* Return: backtrack through the last * big jump. */ do { e->comefrom ^= (1<<NF_ARP_NUMHOOKS); oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; /* We're at the start. */ if (pos == oldpos) goto next; e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; pos += size; } else { int newpos = t->verdict; if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } } next: ; } return 1; } static int check_target(struct arpt_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = arpt_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, .family = NFPROTO_ARP, }; return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); } static int find_check_entry(struct arpt_entry *e, struct net *net, const char *name, unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; int ret; if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto out; } t->u.kernel.target = target; ret = check_target(e, net, name); if (ret) goto err; return 0; err: module_put(t->u.kernel.target->me); out: xt_percpu_counter_free(&e->counters); return ret; } static bool check_underflow(const struct arpt_entry *e) { const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } static inline int check_entry_size_and_hooks(struct arpt_entry *e, struct xt_table_info *newinfo, const unsigned char *base, const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, unsigned int valid_hooks) { unsigned int h; int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; if (!arp_checkentry(&e->arp)) return -EINVAL; err = xt_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (err) return err; /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) return -EINVAL; newinfo->underflow[h] = underflows[h]; } } /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; return 0; } static void cleanup_entry(struct arpt_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; t = arpt_get_target(e); par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_ARP; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in * newinfo). */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct arpt_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct arpt_entry *iter; unsigned int *offsets; unsigned int i; int ret = 0; newinfo->size = repl->size; newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } offsets = xt_alloc_entry_offsets(newinfo->number); if (!offsets) return -ENOMEM; i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0) goto out_free; if (i < repl->num_entries) offsets[i] = (void *)iter - entry0; ++i; if (strcmp(arpt_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } ret = -EINVAL; if (i != repl->num_entries) goto out_free; ret = xt_check_table_hooks(newinfo, repl->valid_hooks); if (ret) goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; goto out_free; } kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; ++i; } if (ret != 0) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; cleanup_entry(iter, net); } return ret; } return ret; out_free: kvfree(offsets); return ret; } static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct arpt_entry *iter; unsigned int cpu; unsigned int i; for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); bcnt = tmp->bcnt; pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; cond_resched(); } } } static void get_old_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct arpt_entry *iter; unsigned int cpu, i; for_each_possible_cpu(cpu) { i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); ++i; } cond_resched(); } } static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care * about). */ countersize = sizeof(struct xt_counters) * private->number; counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); get_counters(private, counters); return counters; } static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; struct xt_table_info *private = table->private; int ret = 0; void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); loc_cpu_entry = private->entries; /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ const struct xt_entry_target *t; e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; } if (copy_to_user(userptr + off + offsetof(struct arpt_entry, counters), &counters[num], sizeof(counters[num])) != 0) { ret = -EFAULT; goto free_counters; } t = arpt_get_target_c(e); if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } } free_counters: vfree(counters); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v > 0) v += xt_compat_calc_jump(NFPROTO_ARP, v); memcpy(dst, &v, sizeof(v)); } static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv > 0) cv -= xt_compat_calc_jump(NFPROTO_ARP, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static int compat_calc_entry(const struct arpt_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); entry_offset = (void *)e - base; t = arpt_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { if (info->hook_entry[i] && (e < (struct arpt_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; if (info->underflow[i] && (e < (struct arpt_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct arpt_entry *iter; const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; ret = xt_compat_init_offsets(NFPROTO_ARP, info->number); if (ret) return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) return ret; } return 0; } #endif static int get_info(struct net *net, void __user *user, const int *len) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; if (*len != sizeof(struct arpt_getinfo)) return -EINVAL; if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(NFPROTO_ARP); #endif t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; } #endif memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); memcpy(info.underflow, private->underflow, sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strscpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; xt_table_unlock(t); module_put(t->me); } else ret = PTR_ERR(t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(NFPROTO_ARP); #endif return ret; } static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, const int *len) { int ret; struct arpt_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct arpt_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else ret = -EAGAIN; module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); return ret; } static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; void *loc_cpu_old_entry; struct arpt_entry *iter; ret = 0; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; } t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } /* You lied! */ if (valid_hooks != t->valid_hooks) { ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; /* Update module usage count based on number of rules */ if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); if ((oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); xt_table_unlock(t); get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries; xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); } vfree(counters); return ret; put_module: module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: vfree(counters); out: return ret; } static int do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct arpt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct arpt_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) { unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; struct xt_table *t; const struct xt_table_info *private; int ret = 0; struct arpt_entry *iter; unsigned int addend; paddc = xt_copy_counters(arg, len, &tmp); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, NFPROTO_ARP, tmp.name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free; } local_bh_disable(); private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: vfree(paddc); return ret; } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_arpt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_ARP_NUMHOOKS]; u32 underflow[NF_ARP_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; struct compat_arpt_entry entries[]; }; static inline void compat_release_entry(struct compat_arpt_entry *e) { struct xt_entry_target *t; t = compat_arpt_get_target(e); module_put(t->u.kernel.target->me); } static int check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, struct xt_table_info *newinfo, unsigned int *size, const unsigned char *base, const unsigned char *limit) { struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; int ret, off; if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct compat_arpt_entry) + sizeof(struct compat_xt_entry_target)) return -EINVAL; if (!arp_checkentry(&e->arp)) return -EINVAL; ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (ret) return ret; off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); entry_offset = (void *)e - (void *)base; t = compat_arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto out; } t->u.kernel.target = target; off += xt_compat_target_offset(target); *size += off; ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); if (ret) goto release_target; return 0; release_target: module_put(t->u.kernel.target->me); out: return ret; } static void compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, unsigned int *size, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; struct arpt_entry *de; unsigned int origsize; int h; origsize = *size; de = *dstptr; memcpy(de, e, sizeof(struct arpt_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); *dstptr += sizeof(struct arpt_entry); *size += sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); de->target_offset = e->target_offset - (origsize - *size); t = compat_arpt_get_target(e); xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } } static int translate_compat_table(struct net *net, struct xt_table_info **pinfo, void **pentry0, const struct compat_arpt_replace *compatr) { unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; struct compat_arpt_entry *iter0; struct arpt_replace repl; unsigned int size; int ret; info = *pinfo; entry0 = *pentry0; size = compatr->size; info->number = compatr->num_entries; j = 0; xt_compat_lock(NFPROTO_ARP); ret = xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); if (ret) goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, entry0, entry0 + compatr->size); if (ret != 0) goto out_unlock; ++j; } ret = -EINVAL; if (j != compatr->num_entries) goto out_unlock; ret = -ENOMEM; newinfo = xt_alloc_table_info(size); if (!newinfo) goto out_unlock; memset(newinfo->entries, 0, size); newinfo->number = compatr->num_entries; for (i = 0; i < NF_ARP_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; size = compatr->size; xt_entry_foreach(iter0, entry0, compatr->size) compat_copy_entry_from_user(iter0, &pos, &size, newinfo, entry1); /* all module references in entry0 are now gone */ xt_compat_flush_offsets(NFPROTO_ARP); xt_compat_unlock(NFPROTO_ARP); memcpy(&repl, compatr, sizeof(*compatr)); for (i = 0; i < NF_ARP_NUMHOOKS; i++) { repl.hook_entry[i] = newinfo->hook_entry[i]; repl.underflow[i] = newinfo->underflow[i]; } repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); return 0; free_newinfo: xt_free_table_info(newinfo); return ret; out_unlock: xt_compat_flush_offsets(NFPROTO_ARP); xt_compat_unlock(NFPROTO_ARP); xt_entry_foreach(iter0, entry0, compatr->size) { if (j-- == 0) break; compat_release_entry(iter0); } return ret; } static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len) { int ret; struct compat_arpt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct arpt_entry *iter; if (len < sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; if ((u64)len < (u64)tmp.size + sizeof(tmp)) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, unsigned int i) { struct xt_entry_target *t; struct compat_arpt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; int ret; origsize = *size; ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) return -EFAULT; *dstptr += sizeof(struct compat_arpt_entry); *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); target_offset = e->target_offset - (origsize - *size); t = arpt_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(target_offset, &ce->target_offset) != 0 || put_user(next_offset, &ce->next_offset) != 0) return -EFAULT; return 0; } static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; unsigned int i = 0; struct arpt_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); pos = userptr; size = total_size; xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) break; } vfree(counters); return ret; } struct compat_arpt_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_arpt_entry entrytable[]; }; static int compat_get_entries(struct net *net, struct compat_arpt_get_entries __user *uptr, int *len) { int ret; struct compat_arpt_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct compat_arpt_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) { ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); } else if (!ret) ret = -EAGAIN; xt_compat_flush_offsets(NFPROTO_ARP); module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); xt_compat_unlock(NFPROTO_ARP); return ret; } #endif static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case ARPT_SO_SET_REPLACE: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else #endif ret = do_replace(sock_net(sk), arg, len); break; case ARPT_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), arg, len); break; default: ret = -EINVAL; } return ret; } static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case ARPT_SO_GET_INFO: ret = get_info(sock_net(sk), user, len); break; case ARPT_SO_GET_ENTRIES: #ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else #endif ret = get_entries(sock_net(sk), user, len); break; case ARPT_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; if (*len != sizeof(rev)) { ret = -EINVAL; break; } if (copy_from_user(&rev, user, sizeof(rev)) != 0) { ret = -EFAULT; break; } rev.name[sizeof(rev.name)-1] = 0; try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, rev.revision, 1, &ret), "arpt_%s", rev.name); break; } default: ret = -EINVAL; } return ret; } static void __arpt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; struct arpt_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *template_ops) { struct nf_hook_ops *ops; unsigned int num_ops; int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) { xt_free_table_info(newinfo); return ret; } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { struct arpt_entry *iter; xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); xt_free_table_info(newinfo); return PTR_ERR(new_table); } num_ops = hweight32(table->valid_hooks); if (num_ops == 0) { ret = -EINVAL; goto out_free; } ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL); if (!ops) { ret = -ENOMEM; goto out_free; } for (i = 0; i < num_ops; i++) ops[i].priv = new_table; new_table->ops = ops; ret = nf_register_net_hooks(net, ops, num_ops); if (ret != 0) goto out_free; return ret; out_free: __arpt_unregister_table(net, new_table); return ret; } void arpt_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); if (table) nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(arpt_unregister_table_pre_exit); void arpt_unregister_table(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); if (table) __arpt_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ static struct xt_target arpt_builtin_tg[] __read_mostly = { { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_ARP, #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, #endif }, { .name = XT_ERROR_TARGET, .target = arpt_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_ARP, }, }; static struct nf_sockopt_ops arpt_sockopts = { .pf = PF_INET, .set_optmin = ARPT_BASE_CTL, .set_optmax = ARPT_SO_SET_MAX+1, .set = do_arpt_set_ctl, .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, .owner = THIS_MODULE, }; static int __net_init arp_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_ARP); } static void __net_exit arp_tables_net_exit(struct net *net) { xt_proto_fini(net, NFPROTO_ARP); } static struct pernet_operations arp_tables_net_ops = { .init = arp_tables_net_init, .exit = arp_tables_net_exit, }; static int __init arp_tables_init(void) { int ret; ret = register_pernet_subsys(&arp_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); if (ret < 0) goto err2; /* Register setsockopt */ ret = nf_register_sockopt(&arpt_sockopts); if (ret < 0) goto err4; return 0; err4: xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); err2: unregister_pernet_subsys(&arp_tables_net_ops); err1: return ret; } static void __exit arp_tables_fini(void) { nf_unregister_sockopt(&arpt_sockopts); xt_unregister_targets(arpt_builtin_tg, ARRAY_SIZE(arpt_builtin_tg)); unregister_pernet_subsys(&arp_tables_net_ops); } EXPORT_SYMBOL(arpt_register_table); EXPORT_SYMBOL(arpt_unregister_table); EXPORT_SYMBOL(arpt_do_table); module_init(arp_tables_init); module_exit(arp_tables_fini); |
1 1 1 9 12 10 11 4 11 11 2 8 5 11 9 11 6 11 6 12 9 10 10 8 10 5 2 2 9 9 9 12 11 6 12 11 11 8 4 2 2 4 2 2 2 4 4 4 4 4 4 1 1 1 8 8 7 1 4 2 4 4 4 1 1 1 1 1 1 1 12 8 4 2 6 1 6 2 9 9 15 9 6 8 8 2 2 2 1 10 2 8 10 2 8 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook */ #include <linux/cpumask.h> #include <linux/spinlock.h> #include <linux/percpu.h> #include "bpf_lru_list.h" #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET #define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET /* Helpers to get the local list index */ #define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) #define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) #define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) #define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) static int get_next_cpu(int cpu) { cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_possible_mask); return cpu; } /* Local list helpers */ static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_FREE_LIST_IDX]; } static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) { return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; } /* bpf_lru_node helpers */ static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) { return READ_ONCE(node->ref); } static void bpf_lru_node_clear_ref(struct bpf_lru_node *node) { WRITE_ONCE(node->ref, 0); } static void bpf_lru_list_count_inc(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]++; } static void bpf_lru_list_count_dec(struct bpf_lru_list *l, enum bpf_lru_list_type type) { if (type < NR_BPF_LRU_LIST_COUNT) l->counts[type]--; } static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, struct bpf_lru_node *node, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; /* If the removing node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; bpf_lru_list_count_dec(l, node->type); node->type = tgt_free_type; list_move(&node->list, free_list); } /* Move nodes from local list to the LRU list */ static void __bpf_lru_node_move_in(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; bpf_lru_node_clear_ref(node); list_move(&node->list, &l->lists[tgt_type]); } /* Move nodes between or within active and inactive list (like * active to inactive, inactive to active or tail of active back to * the head of active). */ static void __bpf_lru_node_move(struct bpf_lru_list *l, struct bpf_lru_node *node, enum bpf_lru_list_type tgt_type) { if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) return; if (node->type != tgt_type) { bpf_lru_list_count_dec(l, node->type); bpf_lru_list_count_inc(l, tgt_type); node->type = tgt_type; } bpf_lru_node_clear_ref(node); /* If the moving node is the next_inactive_rotation candidate, * move the next_inactive_rotation pointer also. */ if (&node->list == l->next_inactive_rotation) l->next_inactive_rotation = l->next_inactive_rotation->prev; list_move(&node->list, &l->lists[tgt_type]); } static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) { return l->counts[BPF_LRU_LIST_T_INACTIVE] < l->counts[BPF_LRU_LIST_T_ACTIVE]; } /* Rotate the active list: * 1. Start from tail * 2. If the node has the ref bit set, it will be rotated * back to the head of active list with the ref bit cleared. * Give this node one more chance to survive in the active list. * 3. If the ref bit is not set, move it to the head of the * inactive list. * 4. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; struct bpf_lru_node *node, *tmp_node, *first_node; unsigned int i = 0; first_node = list_first_entry(active, struct bpf_lru_node, list); list_for_each_entry_safe_reverse(node, tmp_node, active, list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); if (++i == lru->nr_scans || node == first_node) break; } } /* Rotate the inactive list. It starts from the next_inactive_rotation * 1. If the node has ref bit set, it will be moved to the head * of active list with the ref bit cleared. * 2. If the node does not have ref bit set, it will leave it * at its current location (i.e. do nothing) so that it can * be considered during the next inactive_shrink. * 3. It will at most scan nr_scans nodes */ static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, struct bpf_lru_list *l) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct list_head *cur, *last, *next = inactive; struct bpf_lru_node *node; unsigned int i = 0; if (list_empty(inactive)) return; last = l->next_inactive_rotation->next; if (last == inactive) last = last->next; cur = l->next_inactive_rotation; while (i < lru->nr_scans) { if (cur == inactive) { cur = cur->prev; continue; } node = list_entry(cur, struct bpf_lru_node, list); next = cur->prev; if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); if (cur == last) break; cur = next; i++; } l->next_inactive_rotation = next; } /* Shrink the inactive list. It starts from the tail of the * inactive list and only move the nodes without the ref bit * set to the designated free list. */ static unsigned int __bpf_lru_list_shrink_inactive(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; struct bpf_lru_node *node, *tmp_node; unsigned int nshrinked = 0; unsigned int i = 0; list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { if (bpf_lru_node_is_ref(node)) { __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); } else if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); if (++nshrinked == tgt_nshrink) break; } if (++i == lru->nr_scans) break; } return nshrinked; } /* 1. Rotate the active list (if needed) * 2. Always rotate the inactive list */ static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) { if (bpf_lru_list_inactive_low(l)) __bpf_lru_list_rotate_active(lru, l); __bpf_lru_list_rotate_inactive(lru, l); } /* Calls __bpf_lru_list_shrink_inactive() to shrink some * ref-bit-cleared nodes and move them to the designated * free list. * * If it cannot get a free node after calling * __bpf_lru_list_shrink_inactive(). It will just remove * one node from either inactive or active list without * honoring the ref-bit. It prefers inactive list to active * list in this situation. */ static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, struct bpf_lru_list *l, unsigned int tgt_nshrink, struct list_head *free_list, enum bpf_lru_list_type tgt_free_type) { struct bpf_lru_node *node, *tmp_node; struct list_head *force_shrink_list; unsigned int nshrinked; nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, free_list, tgt_free_type); if (nshrinked) return nshrinked; /* Do a force shrink by ignoring the reference bit */ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; else force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, list) { if (lru->del_from_htab(lru->del_arg, node)) { __bpf_lru_node_move_to_free(l, node, free_list, tgt_free_type); return 1; } } return 0; } /* Flush the nodes from the local pending list to the LRU list */ static void __local_list_flush(struct bpf_lru_list *l, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node, *tmp_node; list_for_each_entry_safe_reverse(node, tmp_node, local_pending_list(loc_l), list) { if (bpf_lru_node_is_ref(node)) __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); else __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_INACTIVE); } } static void bpf_lru_list_push_free(struct bpf_lru_list *l, struct bpf_lru_node *node) { unsigned long flags; if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) return; raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_list *l = &lru->common_lru.lru_list; struct bpf_lru_node *node, *tmp_node; unsigned int nfree = 0; raw_spin_lock(&l->lock); __local_list_flush(l, loc_l); __bpf_lru_list_rotate(lru, l); list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); if (++nfree == LOCAL_FREE_TARGET) break; } if (nfree < LOCAL_FREE_TARGET) __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); raw_spin_unlock(&l->lock); } static void __local_list_add_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l, int cpu, struct bpf_lru_node *node, u32 hash) { *(u32 *)((void *)node + lru->hash_offset) = hash; node->cpu = cpu; node->type = BPF_LRU_LOCAL_LIST_T_PENDING; bpf_lru_node_clear_ref(node); list_add(&node->list, local_pending_list(loc_l)); } static struct bpf_lru_node * __local_list_pop_free(struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; node = list_first_entry_or_null(local_free_list(loc_l), struct bpf_lru_node, list); if (node) list_del(&node->list); return node; } static struct bpf_lru_node * __local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) { struct bpf_lru_node *node; bool force = false; ignore_ref: /* Get from the tail (i.e. older element) of the pending list. */ list_for_each_entry_reverse(node, local_pending_list(loc_l), list) { if ((!bpf_lru_node_is_ref(node) || force) && lru->del_from_htab(lru->del_arg, node)) { list_del(&node->list); return node; } } if (!force) { force = true; goto ignore_ref; } return NULL; } static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct list_head *free_list; struct bpf_lru_node *node = NULL; struct bpf_lru_list *l; unsigned long flags; int cpu = raw_smp_processor_id(); l = per_cpu_ptr(lru->percpu_lru, cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_list_rotate(lru, l); free_list = &l->lists[BPF_LRU_LIST_T_FREE]; if (list_empty(free_list)) __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, BPF_LRU_LIST_T_FREE); if (!list_empty(free_list)) { node = list_first_entry(free_list, struct bpf_lru_node, list); *(u32 *)((void *)node + lru->hash_offset) = hash; bpf_lru_node_clear_ref(node); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); } raw_spin_unlock_irqrestore(&l->lock, flags); return node; } static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, u32 hash) { struct bpf_lru_locallist *loc_l, *steal_loc_l; struct bpf_common_lru *clru = &lru->common_lru; struct bpf_lru_node *node; int steal, first_steal; unsigned long flags; int cpu = raw_smp_processor_id(); loc_l = per_cpu_ptr(clru->local_list, cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); node = __local_list_pop_free(loc_l); if (!node) { bpf_lru_list_pop_free_to_local(lru, loc_l); node = __local_list_pop_free(loc_l); } if (node) __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); if (node) return node; /* No free nodes found from the local free list and * the global LRU list. * * Steal from the local free/pending list of the * current CPU and remote CPU in RR. It starts * with the loc_l->next_steal CPU. */ first_steal = loc_l->next_steal; steal = first_steal; do { steal_loc_l = per_cpu_ptr(clru->local_list, steal); raw_spin_lock_irqsave(&steal_loc_l->lock, flags); node = __local_list_pop_free(steal_loc_l); if (!node) node = __local_list_pop_pending(lru, steal_loc_l); raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); steal = get_next_cpu(steal); } while (!node && steal != first_steal); loc_l->next_steal = steal; if (node) { raw_spin_lock_irqsave(&loc_l->lock, flags); __local_list_add_pending(lru, loc_l, cpu, node, hash); raw_spin_unlock_irqrestore(&loc_l->lock, flags); } return node; } struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) { if (lru->percpu) return bpf_percpu_lru_pop_free(lru, hash); else return bpf_common_lru_pop_free(lru, hash); } static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { u8 node_type = READ_ONCE(node->type); unsigned long flags; if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); raw_spin_lock_irqsave(&loc_l->lock, flags); if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { raw_spin_unlock_irqrestore(&loc_l->lock, flags); goto check_lru_list; } node->type = BPF_LRU_LOCAL_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_move(&node->list, local_free_list(loc_l)); raw_spin_unlock_irqrestore(&loc_l->lock, flags); return; } check_lru_list: bpf_lru_list_push_free(&lru->common_lru.lru_list, node); } static void bpf_percpu_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { struct bpf_lru_list *l; unsigned long flags; l = per_cpu_ptr(lru->percpu_lru, node->cpu); raw_spin_lock_irqsave(&l->lock, flags); __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); raw_spin_unlock_irqrestore(&l->lock, flags); } void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { if (lru->percpu) bpf_percpu_lru_push_free(lru, node); else bpf_common_lru_push_free(lru, node); } static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { struct bpf_lru_list *l = &lru->common_lru.lru_list; u32 i; for (i = 0; i < nr_elems; i++) { struct bpf_lru_node *node; node = (struct bpf_lru_node *)(buf + node_offset); node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { u32 i, pcpu_entries; int cpu; struct bpf_lru_list *l; pcpu_entries = nr_elems / num_possible_cpus(); i = 0; for_each_possible_cpu(cpu) { struct bpf_lru_node *node; l = per_cpu_ptr(lru->percpu_lru, cpu); again: node = (struct bpf_lru_node *)(buf + node_offset); node->cpu = cpu; node->type = BPF_LRU_LIST_T_FREE; bpf_lru_node_clear_ref(node); list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); i++; buf += elem_size; if (i == nr_elems) break; if (i % pcpu_entries) goto again; } } void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, u32 elem_size, u32 nr_elems) { if (lru->percpu) bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, nr_elems); else bpf_common_lru_populate(lru, buf, node_offset, elem_size, nr_elems); } static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) { int i; for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) INIT_LIST_HEAD(&loc_l->lists[i]); loc_l->next_steal = cpu; raw_spin_lock_init(&loc_l->lock); } static void bpf_lru_list_init(struct bpf_lru_list *l) { int i; for (i = 0; i < NR_BPF_LRU_LIST_T; i++) INIT_LIST_HEAD(&l->lists[i]); for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) l->counts[i] = 0; l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; raw_spin_lock_init(&l->lock); } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, del_from_htab_func del_from_htab, void *del_arg) { int cpu; if (percpu) { lru->percpu_lru = alloc_percpu(struct bpf_lru_list); if (!lru->percpu_lru) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_list *l; l = per_cpu_ptr(lru->percpu_lru, cpu); bpf_lru_list_init(l); } lru->nr_scans = PERCPU_NR_SCANS; } else { struct bpf_common_lru *clru = &lru->common_lru; clru->local_list = alloc_percpu(struct bpf_lru_locallist); if (!clru->local_list) return -ENOMEM; for_each_possible_cpu(cpu) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(clru->local_list, cpu); bpf_lru_locallist_init(loc_l, cpu); } bpf_lru_list_init(&clru->lru_list); lru->nr_scans = LOCAL_NR_SCANS; } lru->percpu = percpu; lru->del_from_htab = del_from_htab; lru->del_arg = del_arg; lru->hash_offset = hash_offset; return 0; } void bpf_lru_destroy(struct bpf_lru *lru) { if (lru->percpu) free_percpu(lru->percpu_lru); else free_percpu(lru->common_lru.local_list); } |
5 5 3 3 3 1 6 6 6 6 6 1 1 7 7 7 2 3 2 1 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 | // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/mac80211/cfg.c */ #include <net/rtnetlink.h> #include <net/cfg802154.h> #include "ieee802154_i.h" #include "driver-ops.h" #include "cfg.h" static struct net_device * ieee802154_add_iface_deprecated(struct wpan_phy *wpan_phy, const char *name, unsigned char name_assign_type, int type) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); struct net_device *dev; rtnl_lock(); dev = ieee802154_if_add(local, name, name_assign_type, type, cpu_to_le64(0x0000000000000000ULL)); rtnl_unlock(); return dev; } static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy, struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); ieee802154_if_remove(sdata); } #ifdef CONFIG_PM static int ieee802154_suspend(struct wpan_phy *wpan_phy) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); if (!local->open_count) goto suspend; ieee802154_sync_and_hold_queue(local); synchronize_net(); /* stop hardware - this must stop RX */ ieee802154_stop_device(local); suspend: local->suspended = true; return 0; } static int ieee802154_resume(struct wpan_phy *wpan_phy) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); int ret; /* nothing to do if HW shouldn't run */ if (!local->open_count) goto wake_up; /* restart hardware */ ret = drv_start(local, local->phy->filtering, &local->addr_filt); if (ret) return ret; wake_up: ieee802154_release_queue(local); local->suspended = false; return 0; } #else #define ieee802154_suspend NULL #define ieee802154_resume NULL #endif static int ieee802154_add_iface(struct wpan_phy *phy, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { struct ieee802154_local *local = wpan_phy_priv(phy); struct net_device *err; err = ieee802154_if_add(local, name, name_assign_type, type, extended_addr); return PTR_ERR_OR_ZERO(err); } static int ieee802154_del_iface(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev) { ieee802154_if_remove(IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev)); return 0; } static int ieee802154_set_channel(struct wpan_phy *wpan_phy, u8 page, u8 channel) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); int ret; ASSERT_RTNL(); if (wpan_phy->current_page == page && wpan_phy->current_channel == channel) return 0; /* Refuse to change channels during scanning or beaconing */ if (mac802154_is_scanning(local) || mac802154_is_beaconing(local)) return -EBUSY; ret = drv_set_channel(local, page, channel); if (!ret) { wpan_phy->current_page = page; wpan_phy->current_channel = channel; ieee802154_configure_durations(wpan_phy, page, channel); } return ret; } static int ieee802154_set_cca_mode(struct wpan_phy *wpan_phy, const struct wpan_phy_cca *cca) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); int ret; ASSERT_RTNL(); if (wpan_phy_cca_cmp(&wpan_phy->cca, cca)) return 0; ret = drv_set_cca_mode(local, cca); if (!ret) wpan_phy->cca = *cca; return ret; } static int ieee802154_set_cca_ed_level(struct wpan_phy *wpan_phy, s32 ed_level) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); int ret; ASSERT_RTNL(); if (wpan_phy->cca_ed_level == ed_level) return 0; ret = drv_set_cca_ed_level(local, ed_level); if (!ret) wpan_phy->cca_ed_level = ed_level; return ret; } static int ieee802154_set_tx_power(struct wpan_phy *wpan_phy, s32 power) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); int ret; ASSERT_RTNL(); if (wpan_phy->transmit_power == power) return 0; ret = drv_set_tx_power(local, power); if (!ret) wpan_phy->transmit_power = power; return ret; } static int ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 pan_id) { int ret; ASSERT_RTNL(); if (wpan_dev->pan_id == pan_id) return 0; ret = mac802154_wpan_update_llsec(wpan_dev->netdev); if (!ret) wpan_dev->pan_id = pan_id; return ret; } static int ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 min_be, u8 max_be) { ASSERT_RTNL(); wpan_dev->min_be = min_be; wpan_dev->max_be = max_be; return 0; } static int ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 short_addr) { ASSERT_RTNL(); wpan_dev->short_addr = short_addr; return 0; } static int ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, u8 max_csma_backoffs) { ASSERT_RTNL(); wpan_dev->csma_retries = max_csma_backoffs; return 0; } static int ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, s8 max_frame_retries) { ASSERT_RTNL(); wpan_dev->frame_retries = max_frame_retries; return 0; } static int ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool mode) { ASSERT_RTNL(); wpan_dev->lbt = mode; return 0; } static int ieee802154_set_ackreq_default(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool ackreq) { ASSERT_RTNL(); wpan_dev->ackreq = ackreq; return 0; } static int mac802154_trigger_scan(struct wpan_phy *wpan_phy, struct cfg802154_scan_request *request) { struct ieee802154_sub_if_data *sdata; sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(request->wpan_dev); ASSERT_RTNL(); return mac802154_trigger_scan_locked(sdata, request); } static int mac802154_abort_scan(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); struct ieee802154_sub_if_data *sdata; sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev); ASSERT_RTNL(); return mac802154_abort_scan_locked(local, sdata); } static int mac802154_send_beacons(struct wpan_phy *wpan_phy, struct cfg802154_beacon_request *request) { struct ieee802154_sub_if_data *sdata; sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(request->wpan_dev); ASSERT_RTNL(); return mac802154_send_beacons_locked(sdata, request); } static int mac802154_stop_beacons(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); struct ieee802154_sub_if_data *sdata; sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev); ASSERT_RTNL(); return mac802154_stop_beacons_locked(local, sdata); } static int mac802154_associate(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_addr *coord) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); u64 ceaddr = swab64((__force u64)coord->extended_addr); struct ieee802154_sub_if_data *sdata; struct ieee802154_pan_device *parent; __le16 short_addr; int ret; ASSERT_RTNL(); sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev); if (wpan_dev->parent) { dev_err(&sdata->dev->dev, "Device %8phC is already associated\n", &ceaddr); return -EPERM; } if (coord->mode == IEEE802154_SHORT_ADDRESSING) return -EINVAL; parent = kzalloc(sizeof(*parent), GFP_KERNEL); if (!parent) return -ENOMEM; parent->pan_id = coord->pan_id; parent->mode = coord->mode; parent->extended_addr = coord->extended_addr; parent->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST); /* Set the PAN ID hardware address filter beforehand to avoid dropping * the association response with a destination PAN ID field set to the * "new" PAN ID. */ if (local->hw.flags & IEEE802154_HW_AFILT) { ret = drv_set_pan_id(local, coord->pan_id); if (ret < 0) goto free_parent; } ret = mac802154_perform_association(sdata, parent, &short_addr); if (ret) goto reset_panid; if (local->hw.flags & IEEE802154_HW_AFILT) { ret = drv_set_short_addr(local, short_addr); if (ret < 0) goto reset_panid; } wpan_dev->pan_id = coord->pan_id; wpan_dev->short_addr = short_addr; wpan_dev->parent = parent; return 0; reset_panid: if (local->hw.flags & IEEE802154_HW_AFILT) drv_set_pan_id(local, cpu_to_le16(IEEE802154_PAN_ID_BROADCAST)); free_parent: kfree(parent); return ret; } static int mac802154_disassociate_from_parent(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev) { struct ieee802154_local *local = wpan_phy_priv(wpan_phy); struct ieee802154_pan_device *child, *tmp; struct ieee802154_sub_if_data *sdata; unsigned int max_assoc; u64 eaddr; int ret; sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev); /* Start by disassociating all the children and preventing new ones to * attempt associations. */ max_assoc = cfg802154_set_max_associations(wpan_dev, 0); list_for_each_entry_safe(child, tmp, &wpan_dev->children, node) { ret = mac802154_send_disassociation_notif(sdata, child, IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE); if (ret) { eaddr = swab64((__force u64)child->extended_addr); dev_err(&sdata->dev->dev, "Disassociation with %8phC may have failed (%d)\n", &eaddr, ret); } list_del(&child->node); } ret = mac802154_send_disassociation_notif(sdata, wpan_dev->parent, IEEE802154_DEVICE_WISHES_TO_LEAVE); if (ret) { eaddr = swab64((__force u64)wpan_dev->parent->extended_addr); dev_err(&sdata->dev->dev, "Disassociation from %8phC may have failed (%d)\n", &eaddr, ret); } ret = 0; kfree(wpan_dev->parent); wpan_dev->parent = NULL; wpan_dev->pan_id = cpu_to_le16(IEEE802154_PAN_ID_BROADCAST); wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST); if (local->hw.flags & IEEE802154_HW_AFILT) { ret = drv_set_pan_id(local, wpan_dev->pan_id); if (ret < 0) goto reset_mac_assoc; ret = drv_set_short_addr(local, wpan_dev->short_addr); if (ret < 0) goto reset_mac_assoc; } reset_mac_assoc: cfg802154_set_max_associations(wpan_dev, max_assoc); return ret; } static int mac802154_disassociate_child(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_pan_device *child) { struct ieee802154_sub_if_data *sdata; int ret; sdata = IEEE802154_WPAN_DEV_TO_SUB_IF(wpan_dev); ret = mac802154_send_disassociation_notif(sdata, child, IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE); if (ret) return ret; list_del(&child->node); wpan_dev->nchildren--; kfree(child); return 0; } static int mac802154_disassociate(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_addr *target) { u64 teaddr = swab64((__force u64)target->extended_addr); struct ieee802154_pan_device *pan_device; ASSERT_RTNL(); if (cfg802154_device_is_parent(wpan_dev, target)) return mac802154_disassociate_from_parent(wpan_phy, wpan_dev); pan_device = cfg802154_device_is_child(wpan_dev, target); if (pan_device) return mac802154_disassociate_child(wpan_phy, wpan_dev, pan_device); dev_err(&wpan_dev->netdev->dev, "Device %8phC is not associated with us\n", &teaddr); return -EINVAL; } #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static void ieee802154_get_llsec_table(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_llsec_table **table) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); *table = &sdata->sec.table; } static void ieee802154_lock_llsec_table(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); mutex_lock(&sdata->sec_mtx); } static void ieee802154_unlock_llsec_table(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); mutex_unlock(&sdata->sec_mtx); } static int ieee802154_set_llsec_params(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_params *params, int changed) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_set_params(&sdata->sec, params, changed); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_get_llsec_params(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_llsec_params *params) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_get_params(&sdata->sec, params); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_add_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_key_id *id, const struct ieee802154_llsec_key *key) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_key_add(&sdata->sec, id, key); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_del_llsec_key(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_key_id *id) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_key_del(&sdata->sec, id); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_add_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_seclevel *sl) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_seclevel_add(&sdata->sec, sl); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_del_seclevel(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_seclevel *sl) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_seclevel_del(&sdata->sec, sl); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_add_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct ieee802154_llsec_device *dev_desc) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_dev_add(&sdata->sec, dev_desc); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_del_device(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le64 extended_addr) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_dev_del(&sdata->sec, extended_addr); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_add_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *key) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_devkey_add(&sdata->sec, extended_addr, key); mutex_unlock(&sdata->sec_mtx); return res; } static int ieee802154_del_devkey(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *key) { struct net_device *dev = wpan_dev->netdev; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); int res; mutex_lock(&sdata->sec_mtx); res = mac802154_llsec_devkey_del(&sdata->sec, extended_addr, key); mutex_unlock(&sdata->sec_mtx); return res; } #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ const struct cfg802154_ops mac802154_config_ops = { .add_virtual_intf_deprecated = ieee802154_add_iface_deprecated, .del_virtual_intf_deprecated = ieee802154_del_iface_deprecated, .suspend = ieee802154_suspend, .resume = ieee802154_resume, .add_virtual_intf = ieee802154_add_iface, .del_virtual_intf = ieee802154_del_iface, .set_channel = ieee802154_set_channel, .set_cca_mode = ieee802154_set_cca_mode, .set_cca_ed_level = ieee802154_set_cca_ed_level, .set_tx_power = ieee802154_set_tx_power, .set_pan_id = ieee802154_set_pan_id, .set_short_addr = ieee802154_set_short_addr, .set_backoff_exponent = ieee802154_set_backoff_exponent, .set_max_csma_backoffs = ieee802154_set_max_csma_backoffs, .set_max_frame_retries = ieee802154_set_max_frame_retries, .set_lbt_mode = ieee802154_set_lbt_mode, .set_ackreq_default = ieee802154_set_ackreq_default, .trigger_scan = mac802154_trigger_scan, .abort_scan = mac802154_abort_scan, .send_beacons = mac802154_send_beacons, .stop_beacons = mac802154_stop_beacons, .associate = mac802154_associate, .disassociate = mac802154_disassociate, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL .get_llsec_table = ieee802154_get_llsec_table, .lock_llsec_table = ieee802154_lock_llsec_table, .unlock_llsec_table = ieee802154_unlock_llsec_table, /* TODO above */ .set_llsec_params = ieee802154_set_llsec_params, .get_llsec_params = ieee802154_get_llsec_params, .add_llsec_key = ieee802154_add_llsec_key, .del_llsec_key = ieee802154_del_llsec_key, .add_seclevel = ieee802154_add_seclevel, .del_seclevel = ieee802154_del_seclevel, .add_device = ieee802154_add_device, .del_device = ieee802154_del_device, .add_devkey = ieee802154_add_devkey, .del_devkey = ieee802154_del_devkey, #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; |
12 2 10 1 1 1 5 5 2 3 1 1 1 3 3 3 1 12 11 1 1 1 1 1 2 1 1 1 1 2 1 11 3 3 1 1 1 3 3 4 4 1 3 1 1 1 2 2 1 1 1 1 7 7 7 1 6 6 6 6 8 3 5 1 4 3 1 1 3 5 5 5 2 3 2 1 1 9 9 8 1 1 1 5 1 1 3 3 2 1 1 1 1 5 5 5 5 5 3 3 5 4 4 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/nfc.h> #include <linux/sched/signal.h> #include "nfc.h" #include "llcp.h" static int sock_wait_state(struct sock *sk, int state, unsigned long timeo) { DECLARE_WAITQUEUE(wait, current); int err = 0; pr_debug("sk %p", sk); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); while (sk->sk_state != state) { if (!timeo) { err = -EINPROGRESS; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } static struct proto llcp_sock_proto = { .name = "NFC_LLCP", .owner = THIS_MODULE, .obj_size = sizeof(struct nfc_llcp_sock), }; static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_local *local; struct nfc_dev *dev; struct sockaddr_nfc_llcp llcp_addr; int len, ret = 0; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family); memset(&llcp_addr, 0, sizeof(llcp_addr)); len = min_t(unsigned int, sizeof(llcp_addr), alen); memcpy(&llcp_addr, addr, len); /* This is going to be a listening socket, dsap must be 0 */ if (llcp_addr.dsap != 0) return -EINVAL; lock_sock(sk); if (sk->sk_state != LLCP_CLOSED) { ret = -EBADFD; goto error; } dev = nfc_get_device(llcp_addr.dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, llcp_addr.service_name_len, NFC_LLCP_MAX_SERVICE_NAME); llcp_sock->service_name = kmemdup(llcp_addr.service_name, llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { ret = -ENOMEM; goto sock_llcp_put_local; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -EADDRINUSE; goto free_service_name; } llcp_sock->reserved_ssap = llcp_sock->ssap; nfc_llcp_sock_link(&local->sockets, sk); pr_debug("Socket bound to SAP %d\n", llcp_sock->ssap); sk->sk_state = LLCP_BOUND; nfc_put_device(dev); release_sock(sk); return 0; free_service_name: kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; sock_llcp_put_local: nfc_llcp_local_put(llcp_sock->local); llcp_sock->local = NULL; llcp_sock->dev = NULL; put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_local *local; struct nfc_dev *dev; struct sockaddr_nfc_llcp llcp_addr; int len, ret = 0; if (!addr || alen < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_NFC) return -EINVAL; pr_debug("sk %p addr %p family %d\n", sk, addr, addr->sa_family); memset(&llcp_addr, 0, sizeof(llcp_addr)); len = min_t(unsigned int, sizeof(llcp_addr), alen); memcpy(&llcp_addr, addr, len); lock_sock(sk); if (sk->sk_state != LLCP_CLOSED) { ret = -EBADFD; goto error; } dev = nfc_get_device(llcp_addr.dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->nfc_protocol = llcp_addr.nfc_protocol; nfc_llcp_sock_link(&local->raw_sockets, sk); sk->sk_state = LLCP_BOUND; put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int ret = 0; pr_debug("sk %p backlog %d\n", sk, backlog); lock_sock(sk); if ((sock->type != SOCK_SEQPACKET && sock->type != SOCK_STREAM) || sk->sk_state != LLCP_BOUND) { ret = -EBADFD; goto error; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; pr_debug("Socket listening\n"); sk->sk_state = LLCP_LISTEN; error: release_sock(sk); return ret; } static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); u32 opt; int err = 0; pr_debug("%p optname %d\n", sk, optname); if (level != SOL_NFC) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case NFC_LLCP_RW: if (sk->sk_state == LLCP_CONNECTED || sk->sk_state == LLCP_BOUND || sk->sk_state == LLCP_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > LLCP_MAX_RW) { err = -EINVAL; break; } llcp_sock->rw = (u8) opt; break; case NFC_LLCP_MIUX: if (sk->sk_state == LLCP_CONNECTED || sk->sk_state == LLCP_BOUND || sk->sk_state == LLCP_LISTEN) { err = -EINVAL; break; } err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; if (opt > LLCP_MAX_MIUX) { err = -EINVAL; break; } llcp_sock->miux = cpu_to_be16((u16) opt); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); pr_debug("%p rw %d miux %d\n", llcp_sock, llcp_sock->rw, llcp_sock->miux); return err; } static int nfc_llcp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct nfc_llcp_local *local; struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int len, err = 0; u16 miux, remote_miu; u8 rw; pr_debug("%p optname %d\n", sk, optname); if (level != SOL_NFC) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; local = llcp_sock->local; if (!local) return -ENODEV; len = min_t(u32, len, sizeof(u32)); lock_sock(sk); switch (optname) { case NFC_LLCP_RW: rw = llcp_sock->rw > LLCP_MAX_RW ? local->rw : llcp_sock->rw; if (put_user(rw, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_MIUX: miux = be16_to_cpu(llcp_sock->miux) > LLCP_MAX_MIUX ? be16_to_cpu(local->miux) : be16_to_cpu(llcp_sock->miux); if (put_user(miux, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_MIU: remote_miu = llcp_sock->remote_miu > LLCP_MAX_MIU ? local->remote_miu : llcp_sock->remote_miu; if (put_user(remote_miu, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_LTO: if (put_user(local->remote_lto / 10, (u32 __user *) optval)) err = -EFAULT; break; case NFC_LLCP_REMOTE_RW: if (put_user(llcp_sock->remote_rw, (u32 __user *) optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); if (put_user(len, optlen)) return -EFAULT; return err; } void nfc_llcp_accept_unlink(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); pr_debug("state %d\n", sk->sk_state); list_del_init(&llcp_sock->accept_queue); sk_acceptq_removed(llcp_sock->parent); llcp_sock->parent = NULL; sock_put(sk); } void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct nfc_llcp_sock *llcp_sock_parent = nfc_llcp_sock(parent); /* Lock will be free from unlink */ sock_hold(sk); list_add_tail(&llcp_sock->accept_queue, &llcp_sock_parent->accept_queue); llcp_sock->parent = parent; sk_acceptq_added(parent); } struct sock *nfc_llcp_accept_dequeue(struct sock *parent, struct socket *newsock) { struct nfc_llcp_sock *lsk, *n, *llcp_parent; struct sock *sk; llcp_parent = nfc_llcp_sock(parent); list_for_each_entry_safe(lsk, n, &llcp_parent->accept_queue, accept_queue) { sk = &lsk->sk; lock_sock(sk); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_accept_unlink(sk); continue; } if (sk->sk_state == LLCP_CONNECTED || !newsock) { list_del_init(&lsk->accept_queue); sock_put(sk); if (newsock) sock_graft(sk, newsock); release_sock(sk); pr_debug("Returning sk state %d\n", sk->sk_state); sk_acceptq_removed(parent); return sk; } release_sock(sk); } return NULL; } static int llcp_sock_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { DECLARE_WAITQUEUE(wait, current); struct sock *sk = sock->sk, *new_sk; long timeo; int ret = 0; pr_debug("parent %p\n", sk); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_state != LLCP_LISTEN) { ret = -EBADFD; goto error; } timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); /* Wait for an incoming connection. */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(new_sk = nfc_llcp_accept_dequeue(sk, newsock))) { set_current_state(TASK_INTERRUPTIBLE); if (!timeo) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); if (ret) goto error; newsock->state = SS_CONNECTED; pr_debug("new socket %p\n", new_sk); error: release_sock(sk); return ret; } static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, llcp_addr, uaddr); if (llcp_sock == NULL || llcp_sock->dev == NULL) return -EBADFD; pr_debug("%p %d %d %d\n", sk, llcp_sock->target_idx, llcp_sock->dsap, llcp_sock->ssap); memset(llcp_addr, 0, sizeof(*llcp_addr)); lock_sock(sk); if (!llcp_sock->dev) { release_sock(sk); return -EBADFD; } llcp_addr->sa_family = AF_NFC; llcp_addr->dev_idx = llcp_sock->dev->idx; llcp_addr->target_idx = llcp_sock->target_idx; llcp_addr->nfc_protocol = llcp_sock->nfc_protocol; llcp_addr->dsap = llcp_sock->dsap; llcp_addr->ssap = llcp_sock->ssap; llcp_addr->service_name_len = llcp_sock->service_name_len; memcpy(llcp_addr->service_name, llcp_sock->service_name, llcp_addr->service_name_len); release_sock(sk); return sizeof(struct sockaddr_nfc_llcp); } static inline __poll_t llcp_accept_poll(struct sock *parent) { struct nfc_llcp_sock *llcp_sock, *parent_sock; struct sock *sk; parent_sock = nfc_llcp_sock(parent); list_for_each_entry(llcp_sock, &parent_sock->accept_queue, accept_queue) { sk = &llcp_sock->sk; if (sk->sk_state == LLCP_CONNECTED) return EPOLLIN | EPOLLRDNORM; } return 0; } static __poll_t llcp_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; pr_debug("%p\n", sk); sock_poll_wait(file, sock, wait); if (sk->sk_state == LLCP_LISTEN) return llcp_accept_poll(sk); if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == LLCP_CLOSED) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); pr_debug("mask 0x%x\n", mask); return mask; } static int llcp_sock_release(struct socket *sock) { struct sock *sk = sock->sk; struct nfc_llcp_local *local; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int err = 0; if (!sk) return 0; pr_debug("%p\n", sk); local = llcp_sock->local; if (local == NULL) { err = -ENODEV; goto out; } lock_sock(sk); /* Send a DISC */ if (sk->sk_state == LLCP_CONNECTED) nfc_llcp_send_disconnect(llcp_sock); if (sk->sk_state == LLCP_LISTEN) { struct nfc_llcp_sock *lsk, *n; struct sock *accept_sk; list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue, accept_queue) { accept_sk = &lsk->sk; lock_sock(accept_sk); nfc_llcp_send_disconnect(lsk); nfc_llcp_accept_unlink(accept_sk); release_sock(accept_sk); } } if (sock->type == SOCK_RAW) nfc_llcp_sock_unlink(&local->raw_sockets, sk); else nfc_llcp_sock_unlink(&local->sockets, sk); if (llcp_sock->reserved_ssap < LLCP_SAP_MAX) nfc_llcp_put_ssap(llcp_sock->local, llcp_sock->ssap); release_sock(sk); out: sock_orphan(sk); sock_put(sk); return err; } static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, int len, int flags) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); struct sockaddr_nfc_llcp *addr = (struct sockaddr_nfc_llcp *)_addr; struct nfc_dev *dev; struct nfc_llcp_local *local; int ret = 0; pr_debug("sock %p sk %p flags 0x%x\n", sock, sk, flags); if (!addr || len < sizeof(*addr) || addr->sa_family != AF_NFC) return -EINVAL; if (addr->service_name_len == 0 && addr->dsap == 0) return -EINVAL; pr_debug("addr dev_idx=%u target_idx=%u protocol=%u\n", addr->dev_idx, addr->target_idx, addr->nfc_protocol); lock_sock(sk); if (sk->sk_state == LLCP_CONNECTED) { ret = -EISCONN; goto error; } if (sk->sk_state == LLCP_CONNECTING) { ret = -EINPROGRESS; goto error; } dev = nfc_get_device(addr->dev_idx); if (dev == NULL) { ret = -ENODEV; goto error; } local = nfc_llcp_find_local(dev); if (local == NULL) { ret = -ENODEV; goto put_dev; } device_lock(&dev->dev); if (dev->dep_link_up == false) { ret = -ENOLINK; device_unlock(&dev->dev); goto sock_llcp_put_local; } device_unlock(&dev->dev); if (local->rf_mode == NFC_RF_INITIATOR && addr->target_idx != local->target_idx) { ret = -ENOLINK; goto sock_llcp_put_local; } llcp_sock->dev = dev; llcp_sock->local = local; llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { ret = -ENOMEM; goto sock_llcp_nullify; } llcp_sock->reserved_ssap = llcp_sock->ssap; if (addr->service_name_len == 0) llcp_sock->dsap = addr->dsap; else llcp_sock->dsap = LLCP_SAP_SDP; llcp_sock->nfc_protocol = addr->nfc_protocol; llcp_sock->service_name_len = min_t(unsigned int, addr->service_name_len, NFC_LLCP_MAX_SERVICE_NAME); llcp_sock->service_name = kmemdup(addr->service_name, llcp_sock->service_name_len, GFP_KERNEL); if (!llcp_sock->service_name) { ret = -ENOMEM; goto sock_llcp_release; } nfc_llcp_sock_link(&local->connecting_sockets, sk); ret = nfc_llcp_send_connect(llcp_sock); if (ret) goto sock_unlink; sk->sk_state = LLCP_CONNECTING; ret = sock_wait_state(sk, LLCP_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); if (ret && ret != -EINPROGRESS) goto sock_unlink; release_sock(sk); return ret; sock_unlink: nfc_llcp_sock_unlink(&local->connecting_sockets, sk); kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); sock_llcp_nullify: llcp_sock->local = NULL; llcp_sock->dev = NULL; sock_llcp_put_local: nfc_llcp_local_put(local); put_dev: nfc_put_device(dev); error: release_sock(sk); return ret; } static int llcp_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); int ret; pr_debug("sock %p sk %p", sock, sk); ret = sock_error(sk); if (ret) return ret; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); if (!llcp_sock->local) { release_sock(sk); return -ENODEV; } if (sk->sk_type == SOCK_DGRAM) { if (sk->sk_state != LLCP_BOUND) { release_sock(sk); return -ENOTCONN; } DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, addr, msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) { release_sock(sk); return -EINVAL; } release_sock(sk); return nfc_llcp_send_ui_frame(llcp_sock, addr->dsap, addr->ssap, msg, len); } if (sk->sk_state != LLCP_CONNECTED) { release_sock(sk); return -ENOTCONN; } release_sock(sk); return nfc_llcp_send_i_frame(llcp_sock, msg, len); } static int llcp_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; unsigned int copied, rlen; struct sk_buff *skb, *cskb; int err = 0; pr_debug("%p %zu\n", sk, len); lock_sock(sk); if (sk->sk_state == LLCP_CLOSED && skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); return 0; } release_sock(sk); if (flags & (MSG_OOB)) return -EOPNOTSUPP; skb = skb_recv_datagram(sk, flags, &err); if (!skb) { pr_err("Recv datagram failed state %d %d %d", sk->sk_state, err, sock_error(sk)); if (sk->sk_shutdown & RCV_SHUTDOWN) return 0; return err; } rlen = skb->len; /* real length of skb */ copied = min_t(unsigned int, rlen, len); cskb = skb; if (skb_copy_datagram_msg(cskb, 0, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; } sock_recv_timestamp(msg, sk, skb); if (sk->sk_type == SOCK_DGRAM && msg->msg_name) { struct nfc_llcp_ui_cb *ui_cb = nfc_llcp_ui_skb_cb(skb); DECLARE_SOCKADDR(struct sockaddr_nfc_llcp *, sockaddr, msg->msg_name); msg->msg_namelen = sizeof(struct sockaddr_nfc_llcp); pr_debug("Datagram socket %d %d\n", ui_cb->dsap, ui_cb->ssap); memset(sockaddr, 0, sizeof(*sockaddr)); sockaddr->sa_family = AF_NFC; sockaddr->nfc_protocol = NFC_PROTO_NFC_DEP; sockaddr->dsap = ui_cb->dsap; sockaddr->ssap = ui_cb->ssap; } /* Mark read part of skb as used */ if (!(flags & MSG_PEEK)) { /* SOCK_STREAM: re-queue skb if it contains unreceived data */ if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { skb_pull(skb, copied); if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); goto done; } } kfree_skb(skb); } /* XXX Queue backlogged skbs */ done: /* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */ if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC)) copied = rlen; return copied; } static const struct proto_ops llcp_sock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .bind = llcp_sock_bind, .connect = llcp_sock_connect, .release = llcp_sock_release, .socketpair = sock_no_socketpair, .accept = llcp_sock_accept, .getname = llcp_sock_getname, .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, .setsockopt = nfc_llcp_setsockopt, .getsockopt = nfc_llcp_getsockopt, .sendmsg = llcp_sock_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops llcp_rawsock_ops = { .family = PF_NFC, .owner = THIS_MODULE, .bind = llcp_raw_sock_bind, .connect = sock_no_connect, .release = llcp_sock_release, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = llcp_sock_getname, .poll = llcp_sock_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = sock_no_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, }; static void llcp_sock_destruct(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); pr_debug("%p\n", sk); if (sk->sk_state == LLCP_CONNECTED) nfc_put_device(llcp_sock->dev); skb_queue_purge(&sk->sk_receive_queue); nfc_llcp_sock_free(llcp_sock); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Freeing alive NFC LLCP socket %p\n", sk); return; } } struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; llcp_sock = nfc_llcp_sock(sk); sock_init_data(sock, sk); sk->sk_state = LLCP_CLOSED; sk->sk_protocol = NFC_SOCKPROTO_LLCP; sk->sk_type = type; sk->sk_destruct = llcp_sock_destruct; llcp_sock->ssap = 0; llcp_sock->dsap = LLCP_SAP_SDP; llcp_sock->rw = LLCP_MAX_RW + 1; llcp_sock->miux = cpu_to_be16(LLCP_MAX_MIUX + 1); llcp_sock->send_n = llcp_sock->send_ack_n = 0; llcp_sock->recv_n = llcp_sock->recv_ack_n = 0; llcp_sock->remote_ready = 1; llcp_sock->reserved_ssap = LLCP_SAP_MAX; nfc_llcp_socket_remote_param_init(llcp_sock); skb_queue_head_init(&llcp_sock->tx_queue); skb_queue_head_init(&llcp_sock->tx_pending_queue); INIT_LIST_HEAD(&llcp_sock->accept_queue); if (sock != NULL) sock->state = SS_UNCONNECTED; return sk; } void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) { kfree(sock->service_name); skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); list_del_init(&sock->accept_queue); sock->parent = NULL; nfc_llcp_local_put(sock->local); } static int llcp_sock_create(struct net *net, struct socket *sock, const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; pr_debug("%p\n", sock); if (sock->type != SOCK_STREAM && sock->type != SOCK_DGRAM && sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (sock->type == SOCK_RAW) { if (!capable(CAP_NET_RAW)) return -EPERM; sock->ops = &llcp_rawsock_ops; } else { sock->ops = &llcp_sock_ops; } sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC, kern); if (sk == NULL) return -ENOMEM; return 0; } static const struct nfc_protocol llcp_nfc_proto = { .id = NFC_SOCKPROTO_LLCP, .proto = &llcp_sock_proto, .owner = THIS_MODULE, .create = llcp_sock_create }; int __init nfc_llcp_sock_init(void) { return nfc_proto_register(&llcp_nfc_proto); } void nfc_llcp_sock_exit(void) { nfc_proto_unregister(&llcp_nfc_proto); } |
8523 55 8819 11 8496 8531 56 3 1 1 1 2 2 390 3 2 394 56 59 53 6 372 47 47 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NET_SCM_H #define __LINUX_NET_SCM_H #include <linux/limits.h> #include <linux/net.h> #include <linux/cred.h> #include <linux/file.h> #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/sched/signal.h> #include <net/compat.h> /* Well, we should have at least one descriptor open * to accept passed FDs 8) */ #define SCM_MAX_FD 253 struct scm_creds { u32 pid; kuid_t uid; kgid_t gid; }; #ifdef CONFIG_UNIX struct unix_edge; #endif struct scm_fp_list { short count; short count_unix; short max; #ifdef CONFIG_UNIX bool inflight; bool dead; struct list_head vertices; struct unix_edge *edges; #endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; struct scm_cookie { struct pid *pid; /* Skb credentials */ struct scm_fp_list *fp; /* Passed files */ struct scm_creds creds; /* Skb credentials */ #ifdef CONFIG_SECURITY_NETWORK u32 secid; /* Passed security ID */ #endif }; void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm); void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm); void __scm_destroy(struct scm_cookie *scm); struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl); #ifdef CONFIG_SECURITY_NETWORK static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) { security_socket_getpeersec_dgram(sock, NULL, &scm->secid); } #else static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) { } #endif /* CONFIG_SECURITY_NETWORK */ static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; } static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) { scm_destroy_cred(scm); if (scm->fp) __scm_destroy(scm); } static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, bool forcecreds) { memset(scm, 0, sizeof(*scm)); scm->creds.uid = INVALID_UID; scm->creds.gid = INVALID_GID; if (forcecreds) scm_set_cred(scm, task_tgid(current), current_uid(), current_gid()); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; return __scm_send(sock, msg, scm); } #ifdef CONFIG_SECURITY_NETWORK static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { char *secdata; u32 seclen; int err; if (test_bit(SOCK_PASSSEC, &sock->flags)) { err = security_secid_to_secctx(scm->secid, &secdata, &seclen); if (!err) { put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata); security_release_secctx(secdata, seclen); } } } static inline bool scm_has_secdata(struct socket *sock) { return test_bit(SOCK_PASSSEC, &sock->flags); } #else static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { } static inline bool scm_has_secdata(struct socket *sock) { return false; } #endif /* CONFIG_SECURITY_NETWORK */ static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) { struct file *pidfd_file = NULL; int len, pidfd; /* put_cmsg() doesn't return an error if CMSG is truncated, * that's why we need to opencode these checks here. */ if (msg->msg_flags & MSG_CMSG_COMPAT) len = sizeof(struct compat_cmsghdr) + sizeof(int); else len = sizeof(struct cmsghdr) + sizeof(int); if (msg->msg_controllen < len) { msg->msg_flags |= MSG_CTRUNC; return; } if (!scm->pid) return; pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { if (pidfd_file) { put_unused_fd(pidfd); fput(pidfd_file); } return; } if (pidfd_file) fd_install(pidfd, pidfd_file); } static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!msg->msg_control) { if (test_bit(SOCK_PASSCRED, &sock->flags) || test_bit(SOCK_PASSPIDFD, &sock->flags) || scm->fp || scm_has_secdata(sock)) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); return false; } if (test_bit(SOCK_PASSCRED, &sock->flags)) { struct user_namespace *current_ns = current_user_ns(); struct ucred ucreds = { .pid = scm->creds.pid, .uid = from_kuid_munged(current_ns, scm->creds.uid), .gid = from_kgid_munged(current_ns, scm->creds.gid), }; put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } scm_passec(sock, msg, scm); if (scm->fp) scm_detach_fds(msg, scm); return true; } static inline void scm_recv(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!__scm_recv_common(sock, msg, scm, flags)) return; scm_destroy_cred(scm); } static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!__scm_recv_common(sock, msg, scm, flags)) return; if (test_bit(SOCK_PASSPIDFD, &sock->flags)) scm_pidfd_recv(msg, scm); scm_destroy_cred(scm); } static inline int scm_recv_one_fd(struct file *f, int __user *ufd, unsigned int flags) { if (!ufd) return -EFAULT; return receive_fd(f, ufd, flags); } #endif /* __LINUX_NET_SCM_H */ |
678 679 6 6 6 6 6 880 315 218 1314 1309 4 663 4 6 6 6 6 6 6 6 6 6 6 6 5 1 1 1 1 6 5 1 1 7 1309 6 1322 1330 16 1314 22 1309 1318 1322 662 664 662 46 622 177 1155 1155 1157 1160 485 487 486 8 5 4 1 4 484 3 3 3 893 21239 3 21313 21285 21315 21302 240 12 9 9 170 5 880 21153 19695 21338 102 104 1661 144 1818 1817 1808 141 3 3 5 1 4 1 4 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> #include <linux/init_task.h> #include "internal.h" /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count * @cnt: Current reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { /* Did this drop the last reference? */ if (likely(cnt == FILE_REF_NOREF)) { /* * Carefully try to set the reference count to FILE_REF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { atomic_long_set(&ref->refcnt, FILE_REF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > FILE_REF_MAXREF) atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); return false; } EXPORT_SYMBOL_GPL(__file_ref_put); unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". */ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; unsigned int nr; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. Since we called only * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab * already gives BITS_PER_LONG slots), the above boils down to * 1. use the smallest power of two large enough to give us that many * slots. * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is * 256 slots (i.e. 1Kb fd array). * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there * and we are never going to be asked for 64 or less. */ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) nr = 256; else nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) return ERR_PTR(-EMFILE); } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 0 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (IS_ERR(new_fdt)) return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, bool set) { if (set) { __set_bit(fd, fdt->close_on_exec); } else { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (test_bit(fd, fdt->full_fds_bits)) __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files); if (IS_ERR(new_fdt)) { kmem_cache_free(files_cachep, newf); return ERR_CAST(new_fdt); } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { get_file(f); } else { /* * The fd may be claimed in the fd bitmap but not yet * instantiated in the files array if a sibling thread * is partway through open(). So make sure that this * fd is available to the new process. */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (unlikely(fd >= end)) goto out; if (unlikely(fd >= fdt->max_fds)) { error = expand_files(files, fd); if (error < 0) goto out; goto repeat; } if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array. * * The VFS is full of places where we drop the files lock between * setting the open_fds bitmap and installing the file in the file * array. At any such point, we are vulnerable to a dup2() race * installing a file in the array before us. We need to detect this and * fput() the struct file we are about to overwrite in this case. * * It should never happen - if we allow dup2() do it, _really_ bad things * will follow. * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); WARN_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = fdt->fd[fd]; if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * file_ref_get() above provided a full memory barrier when we * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. */ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * file_ref_get() gives us a full memory barrier. We * only really need an 'acquire' one to protect the * loads below, but we don't have that. */ if (unlikely(!file_ref_get(&file->f_ref))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. * * (As an exception to rule 2, you can call filp_close between fget_light and * fput_light provided that you capture a real refcount with get_file before * the call to filp_close, and ensure that this real refcount is fput *after* * the fput_light call.) * * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { return (file->f_mode & FMODE_ATOMIC_POS) && (file_count(file) > 1 || file->f_op->iterate_shared); } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (file && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * We need to detect attempts to do dup2() over allocated but still * not finished descriptor. NB: OpenBSD avoids that at the price of * extra work in their equivalent of fget() - they insert struct * file immediately after grabbing descriptor, mark it larval if * more work (e.g. actual opening) is needed and make sure that * fget() treats larval files as absent. Potentially interesting, * but while extra work in fget() is trivial, locking implications * and amount of surgery on open()-related paths in VFS are not. * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" * deadlocks in rather amusing ways, AFAICS. All of that is out of * scope of POSIX or SUS, since neither considers shared descriptor * tables and this condition does not arise without those. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = fdt->fd[fd]; if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { put_unused_fd(new_fd); return error; } } fd_install(new_fd, get_file(file)); __receive_sock(file); return new_fd; } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd); |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Synchronous Compression operations * * Copyright 2015 LG Electronics Inc. * Copyright (c) 2016, Intel Corporation * Author: Giovanni Cabiddu <giovanni.cabiddu@intel.com> */ #include <crypto/internal/acompress.h> #include <crypto/internal/scompress.h> #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <net/netlink.h> #include "compress.h" struct scomp_scratch { spinlock_t lock; void *src; void *dst; }; static DEFINE_PER_CPU(struct scomp_scratch, scomp_scratch) = { .lock = __SPIN_LOCK_UNLOCKED(scomp_scratch.lock), }; static const struct crypto_type crypto_scomp_type; static int scomp_scratch_users; static DEFINE_MUTEX(scomp_lock); static int __maybe_unused crypto_scomp_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_comp rscomp; memset(&rscomp, 0, sizeof(rscomp)); strscpy(rscomp.type, "scomp", sizeof(rscomp.type)); return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(rscomp), &rscomp); } static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_scomp_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : scomp\n"); } static void crypto_scomp_free_scratches(void) { struct scomp_scratch *scratch; int i; for_each_possible_cpu(i) { scratch = per_cpu_ptr(&scomp_scratch, i); vfree(scratch->src); vfree(scratch->dst); scratch->src = NULL; scratch->dst = NULL; } } static int crypto_scomp_alloc_scratches(void) { struct scomp_scratch *scratch; int i; for_each_possible_cpu(i) { void *mem; scratch = per_cpu_ptr(&scomp_scratch, i); mem = vmalloc_node(SCOMP_SCRATCH_SIZE, cpu_to_node(i)); if (!mem) goto error; scratch->src = mem; mem = vmalloc_node(SCOMP_SCRATCH_SIZE, cpu_to_node(i)); if (!mem) goto error; scratch->dst = mem; } return 0; error: crypto_scomp_free_scratches(); return -ENOMEM; } static int crypto_scomp_init_tfm(struct crypto_tfm *tfm) { int ret = 0; mutex_lock(&scomp_lock); if (!scomp_scratch_users++) ret = crypto_scomp_alloc_scratches(); mutex_unlock(&scomp_lock); return ret; } static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) { struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); void **tfm_ctx = acomp_tfm_ctx(tfm); struct crypto_scomp *scomp = *tfm_ctx; void **ctx = acomp_request_ctx(req); struct scomp_scratch *scratch; void *src, *dst; unsigned int dlen; int ret; if (!req->src || !req->slen || req->slen > SCOMP_SCRATCH_SIZE) return -EINVAL; if (req->dst && !req->dlen) return -EINVAL; if (!req->dlen || req->dlen > SCOMP_SCRATCH_SIZE) req->dlen = SCOMP_SCRATCH_SIZE; dlen = req->dlen; scratch = raw_cpu_ptr(&scomp_scratch); spin_lock(&scratch->lock); if (sg_nents(req->src) == 1 && !PageHighMem(sg_page(req->src))) { src = page_to_virt(sg_page(req->src)) + req->src->offset; } else { scatterwalk_map_and_copy(scratch->src, req->src, 0, req->slen, 0); src = scratch->src; } if (req->dst && sg_nents(req->dst) == 1 && !PageHighMem(sg_page(req->dst))) dst = page_to_virt(sg_page(req->dst)) + req->dst->offset; else dst = scratch->dst; if (dir) ret = crypto_scomp_compress(scomp, src, req->slen, dst, &req->dlen, *ctx); else ret = crypto_scomp_decompress(scomp, src, req->slen, dst, &req->dlen, *ctx); if (!ret) { if (!req->dst) { req->dst = sgl_alloc(req->dlen, GFP_ATOMIC, NULL); if (!req->dst) { ret = -ENOMEM; goto out; } } else if (req->dlen > dlen) { ret = -ENOSPC; goto out; } if (dst == scratch->dst) { scatterwalk_map_and_copy(scratch->dst, req->dst, 0, req->dlen, 1); } else { int nr_pages = DIV_ROUND_UP(req->dst->offset + req->dlen, PAGE_SIZE); int i; struct page *dst_page = sg_page(req->dst); for (i = 0; i < nr_pages; i++) flush_dcache_page(dst_page + i); } } out: spin_unlock(&scratch->lock); return ret; } static int scomp_acomp_compress(struct acomp_req *req) { return scomp_acomp_comp_decomp(req, 1); } static int scomp_acomp_decompress(struct acomp_req *req) { return scomp_acomp_comp_decomp(req, 0); } static void crypto_exit_scomp_ops_async(struct crypto_tfm *tfm) { struct crypto_scomp **ctx = crypto_tfm_ctx(tfm); crypto_free_scomp(*ctx); mutex_lock(&scomp_lock); if (!--scomp_scratch_users) crypto_scomp_free_scratches(); mutex_unlock(&scomp_lock); } int crypto_init_scomp_ops_async(struct crypto_tfm *tfm) { struct crypto_alg *calg = tfm->__crt_alg; struct crypto_acomp *crt = __crypto_acomp_tfm(tfm); struct crypto_scomp **ctx = crypto_tfm_ctx(tfm); struct crypto_scomp *scomp; if (!crypto_mod_get(calg)) return -EAGAIN; scomp = crypto_create_tfm(calg, &crypto_scomp_type); if (IS_ERR(scomp)) { crypto_mod_put(calg); return PTR_ERR(scomp); } *ctx = scomp; tfm->exit = crypto_exit_scomp_ops_async; crt->compress = scomp_acomp_compress; crt->decompress = scomp_acomp_decompress; crt->dst_free = sgl_free; crt->reqsize = sizeof(void *); return 0; } struct acomp_req *crypto_acomp_scomp_alloc_ctx(struct acomp_req *req) { struct crypto_acomp *acomp = crypto_acomp_reqtfm(req); struct crypto_tfm *tfm = crypto_acomp_tfm(acomp); struct crypto_scomp **tfm_ctx = crypto_tfm_ctx(tfm); struct crypto_scomp *scomp = *tfm_ctx; void *ctx; ctx = crypto_scomp_alloc_ctx(scomp); if (IS_ERR(ctx)) { kfree(req); return NULL; } *req->__ctx = ctx; return req; } void crypto_acomp_scomp_free_ctx(struct acomp_req *req) { struct crypto_acomp *acomp = crypto_acomp_reqtfm(req); struct crypto_tfm *tfm = crypto_acomp_tfm(acomp); struct crypto_scomp **tfm_ctx = crypto_tfm_ctx(tfm); struct crypto_scomp *scomp = *tfm_ctx; void *ctx = *req->__ctx; if (ctx) crypto_scomp_free_ctx(scomp, ctx); } static const struct crypto_type crypto_scomp_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_scomp_init_tfm, #ifdef CONFIG_PROC_FS .show = crypto_scomp_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_scomp_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SCOMPRESS, .tfmsize = offsetof(struct crypto_scomp, base), }; int crypto_register_scomp(struct scomp_alg *alg) { struct crypto_alg *base = &alg->calg.base; comp_prepare_alg(&alg->calg); base->cra_type = &crypto_scomp_type; base->cra_flags |= CRYPTO_ALG_TYPE_SCOMPRESS; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_scomp); void crypto_unregister_scomp(struct scomp_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_scomp); int crypto_register_scomps(struct scomp_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_scomp(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_scomp(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_scomps); void crypto_unregister_scomps(struct scomp_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_scomp(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_scomps); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Synchronous compression type"); |
26 25 2 1 24 1 36 36 8 4 23 14 9 5 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_queue.h> #include <net/ip6_checksum.h> #ifdef CONFIG_INET __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if ((protocol != IPPROTO_TCP && protocol != IPPROTO_UDP && !csum_fold(skb->csum)) || !csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len - dataoff, protocol, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; } fallthrough; case CHECKSUM_NONE: if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP) skb->csum = 0; else skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len - dataoff, protocol, 0); csum = __skb_checksum_complete(skb); } return csum; } EXPORT_SYMBOL(nf_ip_checksum); #endif static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip_checksum(skb, hook, dataoff, protocol); fallthrough; case CHECKSUM_NONE: skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, skb->len - dataoff, 0); skb->ip_summed = CHECKSUM_NONE; return __skb_checksum_complete_head(skb, dataoff + len); } return csum; } __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(skb->csum, skb_checksum(skb, 0, dataoff, 0)))) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; } fallthrough; case CHECKSUM_NONE: skb->csum = ~csum_unfold( csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(0, skb_checksum(skb, 0, dataoff, 0)))); csum = __skb_checksum_complete(skb); } return csum; } EXPORT_SYMBOL(nf_ip6_checksum); static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); __wsum hsum; __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip6_checksum(skb, hook, dataoff, protocol); fallthrough; case CHECKSUM_NONE: hsum = skb_checksum(skb, 0, dataoff, 0); skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(0, hsum))); skb->ip_summed = CHECKSUM_NONE; return __skb_checksum_complete_head(skb, dataoff + len); } return csum; }; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol, unsigned short family) { __sum16 csum = 0; switch (family) { case AF_INET: csum = nf_ip_checksum(skb, hook, dataoff, protocol); break; case AF_INET6: csum = nf_ip6_checksum(skb, hook, dataoff, protocol); break; } return csum; } EXPORT_SYMBOL_GPL(nf_checksum); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol, unsigned short family) { __sum16 csum = 0; switch (family) { case AF_INET: csum = nf_ip_checksum_partial(skb, hook, dataoff, len, protocol); break; case AF_INET6: csum = nf_ip6_checksum_partial(skb, hook, dataoff, len, protocol); break; } return csum; } EXPORT_SYMBOL_GPL(nf_checksum_partial); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family) { const struct nf_ipv6_ops *v6ops __maybe_unused; int ret = 0; switch (family) { case AF_INET: ret = nf_ip_route(net, dst, fl, strict); break; case AF_INET6: ret = nf_ip6_route(net, dst, fl, strict); break; } return ret; } EXPORT_SYMBOL_GPL(nf_route); /* Only get and check the lengths, not do any hop-by-hop stuff. */ int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen) { int len, off = sizeof(struct ipv6hdr); unsigned char *nh; if (!pskb_may_pull(skb, off + 8)) return -ENOMEM; nh = (unsigned char *)(ipv6_hdr(skb) + 1); len = (nh[1] + 1) << 3; if (!pskb_may_pull(skb, off + len)) return -ENOMEM; nh = skb_network_header(skb); off += 2; len -= 2; while (len > 0) { int optlen; if (nh[off] == IPV6_TLV_PAD1) { off++; len--; continue; } if (len < 2) return -EBADMSG; optlen = nh[off + 1] + 2; if (optlen > len) return -EBADMSG; if (nh[off] == IPV6_TLV_JUMBO) { u32 pkt_len; if (nh[off + 1] != 4 || (off & 3) != 2) return -EBADMSG; pkt_len = ntohl(*(__be32 *)(nh + off + 2)); if (pkt_len <= IPV6_MAXPLEN || ipv6_hdr(skb)->payload_len) return -EBADMSG; if (pkt_len > skb->len - sizeof(struct ipv6hdr)) return -EBADMSG; *plen = pkt_len; } off += optlen; len -= optlen; } return len ? -EBADMSG : 0; } EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len); |
5 5 5 5 10 10 10 6 6 6 6 6 18 18 18 18 11 11 11 11 31 31 31 31 31 31 31 2 2 2 2 2 2 22 22 22 22 22 22 10 10 10 10 10 10 11 11 11 11 11 11 11 34 36 36 35 36 36 36 1 1 1 1 1 1 1 60 60 61 61 58 60 61 60 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015 Intel Deutschland GmbH * Copyright (C) 2022-2024 Intel Corporation */ #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" #include "driver-ops.h" #include "debugfs_sta.h" #include "debugfs_netdev.h" int drv_start(struct ieee80211_local *local) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(local->started)) return -EALREADY; trace_drv_start(local); local->started = true; /* allow rx frames */ smp_mb(); ret = local->ops->start(&local->hw); trace_drv_return_int(local, ret); if (ret) local->started = false; return ret; } void drv_stop(struct ieee80211_local *local, bool suspend) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!local->started)) return; trace_drv_stop(local, suspend); local->ops->stop(&local->hw, suspend); trace_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); tasklet_enable(&local->tasklet); barrier(); local->started = false; } int drv_add_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; trace_drv_add_interface(local, sdata); ret = local->ops->add_interface(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); if (ret) return ret; if (!(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) { sdata->flags |= IEEE80211_SDATA_IN_DRIVER; drv_vif_add_debugfs(local, sdata); /* initially vif is not MLD */ ieee80211_link_debugfs_drv_add(&sdata->deflink); } return 0; } int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_change_interface(local, sdata, type, p2p); ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); trace_drv_return_int(local, ret); return ret; } void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; /* Remove driver debugfs entries */ ieee80211_debugfs_recreate_netdev(sdata, sdata->vif.valid_links); trace_drv_remove_interface(local, sdata); local->ops->remove_interface(&local->hw, &sdata->vif); trace_drv_return_void(local); } __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); if (local->ops->sta_state) { ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta, old_state, new_state); } else if (old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC) { ret = drv_sta_add(local, sdata, &sta->sta); if (ret == 0) { sta->uploaded = true; if (rcu_access_pointer(sta->sta.rates)) drv_sta_rate_tbl_update(local, sdata, &sta->sta); } } else if (old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTH) { drv_sta_remove(local, sdata, &sta->sta); } trace_drv_return_int(local, ret); return ret; } __must_check int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_set_txpwr(local, sdata, &sta->sta); if (local->ops->sta_set_txpwr) ret = local->ops->sta_set_txpwr(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_int(local, ret); return ret; } void drv_link_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_link_sta *link_sta, u32 changed) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); trace_drv_link_sta_rc_update(local, sdata, link_sta, changed); if (local->ops->link_sta_rc_update) local->ops->link_sta_rc_update(&local->hw, &sdata->vif, link_sta, changed); trace_drv_return_void(local); } int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_link_data *link, u16 ac, const struct ieee80211_tx_queue_params *params) { struct ieee80211_sub_if_data *sdata = link->sdata; int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) return 0; if (params->cw_min == 0 || params->cw_min > params->cw_max) { /* * If we can't configure hardware anyway, don't warn. We may * never have initialized the CW parameters. */ WARN_ONCE(local->ops->conf_tx, "%s: invalid CW_min/CW_max: %d/%d\n", sdata->name, params->cw_min, params->cw_max); return -EINVAL; } trace_drv_conf_tx(local, sdata, link->link_id, ac, params); if (local->ops->conf_tx) ret = local->ops->conf_tx(&local->hw, &sdata->vif, link->link_id, ac, params); trace_drv_return_int(local, ret); return ret; } u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { u64 ret = -1ULL; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return ret; trace_drv_get_tsf(local, sdata); if (local->ops->get_tsf) ret = local->ops->get_tsf(&local->hw, &sdata->vif); trace_drv_return_u64(local, ret); return ret; } void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_set_tsf(local, sdata, tsf); if (local->ops->set_tsf) local->ops->set_tsf(&local->hw, &sdata->vif, tsf); trace_drv_return_void(local); } void drv_offset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_offset_tsf(local, sdata, offset); if (local->ops->offset_tsf) local->ops->offset_tsf(&local->hw, &sdata->vif, offset); trace_drv_return_void(local); } void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return; trace_drv_reset_tsf(local, sdata); if (local->ops->reset_tsf) local->ops->reset_tsf(&local->hw, &sdata->vif); trace_drv_return_void(local); } int drv_assign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); /* * We should perhaps push emulate chanctx down and only * make it call ->config() when the chanctx is actually * assigned here (and unassigned below), but that's yet * another change to all drivers to add assign/unassign * emulation callbacks. Maybe later. */ if (sdata->vif.type == NL80211_IFTYPE_MONITOR && local->emulate_chanctx && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return 0; if (!check_sdata_in_driver(sdata)) return -EIO; if (!ieee80211_vif_link_active(&sdata->vif, link_conf->link_id)) return 0; trace_drv_assign_vif_chanctx(local, sdata, link_conf, ctx); if (local->ops->assign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); ret = local->ops->assign_vif_chanctx(&local->hw, &sdata->vif, link_conf, &ctx->conf); } trace_drv_return_int(local, ret); return ret; } void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (sdata->vif.type == NL80211_IFTYPE_MONITOR && local->emulate_chanctx && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) return; if (!check_sdata_in_driver(sdata)) return; if (!ieee80211_vif_link_active(&sdata->vif, link_conf->link_id)) return; trace_drv_unassign_vif_chanctx(local, sdata, link_conf, ctx); if (local->ops->unassign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->unassign_vif_chanctx(&local->hw, &sdata->vif, link_conf, &ctx->conf); } trace_drv_return_void(local); } int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode) { int ret = 0; int i; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->switch_vif_chanctx) return -EOPNOTSUPP; for (i = 0; i < n_vifs; i++) { struct ieee80211_chanctx *new_ctx = container_of(vifs[i].new_ctx, struct ieee80211_chanctx, conf); struct ieee80211_chanctx *old_ctx = container_of(vifs[i].old_ctx, struct ieee80211_chanctx, conf); WARN_ON_ONCE(!old_ctx->driver_present); WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && new_ctx->driver_present) || (mode == CHANCTX_SWMODE_REASSIGN_VIF && !new_ctx->driver_present)); } trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); ret = local->ops->switch_vif_chanctx(&local->hw, vifs, n_vifs, mode); trace_drv_return_int(local, ret); if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) { for (i = 0; i < n_vifs; i++) { struct ieee80211_chanctx *new_ctx = container_of(vifs[i].new_ctx, struct ieee80211_chanctx, conf); struct ieee80211_chanctx *old_ctx = container_of(vifs[i].old_ctx, struct ieee80211_chanctx, conf); new_ctx->driver_present = true; old_ctx->driver_present = false; } } return ret; } int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_ampdu_action(local, sdata, params); if (local->ops->ampdu_action) ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params); trace_drv_return_int(local, ret); return ret; } void drv_link_info_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *info, int link_id, u64 changed) { might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON_ONCE(changed & (BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED) && sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT && sdata->vif.type != NL80211_IFTYPE_OCB)) return; if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !sdata->vif.bss_conf.mu_mimo_owner && !(changed & BSS_CHANGED_TXPOWER)))) return; if (!check_sdata_in_driver(sdata)) return; if (!ieee80211_vif_link_active(&sdata->vif, link_id)) return; trace_drv_link_info_changed(local, sdata, info, changed); if (local->ops->link_info_changed) local->ops->link_info_changed(&local->hw, &sdata->vif, info, changed); else if (local->ops->bss_info_changed) local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed); trace_drv_return_void(local); } int drv_set_key(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; if (WARN_ON(key->link_id >= 0 && sdata->vif.active_links && !(sdata->vif.active_links & BIT(key->link_id)))) return -ENOLINK; trace_drv_set_key(local, cmd, sdata, sta, key); ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key); trace_drv_return_int(local, ret); return ret; } int drv_change_vif_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]) { struct ieee80211_link_data *link; unsigned long links_to_add; unsigned long links_to_rem; unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; if (old_links == new_links) return 0; links_to_add = ~old_links & new_links; links_to_rem = old_links & ~new_links; for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { link = rcu_access_pointer(sdata->link[link_id]); ieee80211_link_debugfs_drv_remove(link); } trace_drv_change_vif_links(local, sdata, old_links, new_links); if (local->ops->change_vif_links) ret = local->ops->change_vif_links(&local->hw, &sdata->vif, old_links, new_links, old); trace_drv_return_int(local, ret); if (ret) return ret; if (!local->in_reconfig && !local->resuming) { for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { link = rcu_access_pointer(sdata->link[link_id]); ieee80211_link_debugfs_drv_add(link); } } return 0; } int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u16 old_links, u16 new_links) { struct sta_info *info = container_of(sta, struct sta_info, sta); struct link_sta_info *link_sta; unsigned long links_to_add; unsigned long links_to_rem; unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (!check_sdata_in_driver(sdata)) return -EIO; old_links &= sdata->vif.active_links; new_links &= sdata->vif.active_links; if (old_links == new_links) return 0; links_to_add = ~old_links & new_links; links_to_rem = old_links & ~new_links; for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { link_sta = rcu_dereference_protected(info->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); ieee80211_link_sta_debugfs_drv_remove(link_sta); } trace_drv_change_sta_links(local, sdata, sta, old_links, new_links); if (local->ops->change_sta_links) ret = local->ops->change_sta_links(&local->hw, &sdata->vif, sta, old_links, new_links); trace_drv_return_int(local, ret); if (ret) return ret; /* during reconfig don't add it to debugfs again */ if (local->in_reconfig || local->resuming) return 0; for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { link_sta = rcu_dereference_protected(info->link[link_id], lockdep_is_held(&local->hw.wiphy->mtx)); ieee80211_link_sta_debugfs_drv_add(link_sta); } return 0; } |
394 264 477 330 404 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | #undef TRACE_SYSTEM #define TRACE_SYSTEM qdisc #if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_QDISC_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> TRACE_EVENT(qdisc_dequeue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, int packets, struct sk_buff *skb), TP_ARGS(qdisc, txq, packets, skb), TP_STRUCT__entry( __field( struct Qdisc *, qdisc ) __field(const struct netdev_queue *, txq ) __field( int, packets ) __field( void *, skbaddr ) __field( int, ifindex ) __field( u32, handle ) __field( u32, parent ) __field( unsigned long, txq_state) ), /* skb==NULL indicate packets dequeued was 0, even when packets==1 */ TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->packets = skb ? packets : 0; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; __entry->txq_state = txq->state; ), TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); TRACE_EVENT(qdisc_enqueue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb), TP_ARGS(qdisc, txq, skb), TP_STRUCT__entry( __field(struct Qdisc *, qdisc) __field(const struct netdev_queue *, txq) __field(void *, skbaddr) __field(int, ifindex) __field(u32, handle) __field(u32, parent) ), TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; ), TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr) ); TRACE_EVENT(qdisc_reset, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ? qdisc_dev(q)->name : "(null)" ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_destroy, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q)->name ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_create, TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), TP_ARGS(ops, dev, parent), TP_STRUCT__entry( __string( dev, dev->name ) __string( kind, ops->id ) __field( u32, parent ) ), TP_fast_assign( __assign_str(dev); __assign_str(kind); __entry->parent = parent; ), TP_printk("dev=%s kind=%s parent=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) ); #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
144 53 390 58 6 6 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_NULLS_H #define _LINUX_RCULIST_NULLS_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list_nulls.h> #include <linux/rcupdate.h> /** * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_nulls_add_head_rcu() or * hlist_nulls_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_nulls_for_each_entry_rcu(). */ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) { if (!hlist_nulls_unhashed(n)) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * hlist_nulls_first_rcu - returns the first element of the hash list. * @head: the head of the list. */ #define hlist_nulls_first_rcu(head) \ (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) /** * hlist_nulls_next_rcu - returns the element of the list after @node. * @node: element of the list. */ #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry(). */ static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; WRITE_ONCE(n->next, first); WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_nulls_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; !is_a_nulls(i); i = i->next) last = i; if (last) { WRITE_ONCE(n->next, last->next); n->pprev = &last->next; rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); } } /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { n->pprev = &n->next; n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); } /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. * * The barrier() is needed to make sure compiler doesn't cache first element [1], * as this loop can be restarted [2] * [1] Documentation/memory-barriers.txt around line 1533 * [2] Documentation/RCU/rculist_nulls.rst around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) /** * hlist_nulls_for_each_entry_safe - * iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. */ #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) #endif #endif |
3 3 3 3 2 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLK_CGROUP_PRIVATE_H #define _BLK_CGROUP_PRIVATE_H /* * block cgroup private header * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> */ #include <linux/blk-cgroup.h> #include <linux/cgroup.h> #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> #include "blk.h" struct blkcg_gq; struct blkg_policy_data; /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { BLKG_IOSTAT_READ, BLKG_IOSTAT_WRITE, BLKG_IOSTAT_DISCARD, BLKG_IOSTAT_NR, }; struct blkg_iostat { u64 bytes[BLKG_IOSTAT_NR]; u64 ios[BLKG_IOSTAT_NR]; }; struct blkg_iostat_set { struct u64_stats_sync sync; struct blkcg_gq *blkg; struct llist_node lnode; int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ struct request_queue *q; struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; /* reference count */ struct percpu_ref refcnt; /* is this blkg online? protected by both blkcg and q locks */ bool online; struct blkg_iostat_set __percpu *iostat_cpu; struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spinlock_t async_bio_lock; struct bio_list async_bios; #endif union { struct work_struct async_bio_work; struct work_struct free_work; }; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; struct rcu_head rcu_head; }; struct blkcg { struct cgroup_subsys_state css; spinlock_t lock; refcount_t online_pin; /* If there is block congestion on this cgroup. */ atomic_t congestion_count; struct radix_tree_root blkg_tree; struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; /* * List of updated percpu blkg_iostat_set's since the last flush. */ struct llist_head __percpu *lhead; #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif }; static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * * There can be multiple active blkcg policies and each blkg:policy pair is * represented by a blkg_policy_data which is allocated and freed by each * policy's pd_alloc/free_fn() methods. A policy can allocate private data * area by allocating larger data structure which embeds blkg_policy_data * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; bool online; }; /* * Policies that need to keep per-blkcg data which is independent from any * request_queue associated to it should implement cpd_alloc/free_fn() * methods. A policy can allocate private data area by allocating larger * data structure which embeds blkcg_policy_data at the beginning. * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ struct blkcg *blkcg; int plid; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, struct seq_file *s); struct blkcg_policy { int plid; /* cgroup files for the policy */ struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; blkcg_pol_stat_pd_fn *pd_stat_fn; }; extern struct blkcg blkcg_root; extern bool blkcg_debug_stats; void blkg_init_queue(struct request_queue *q); int blkcg_init_disk(struct gendisk *disk); void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { char *input; char *body; struct block_device *bdev; struct blkcg_gq *blkg; }; void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx); void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg * @return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning * blkg. The idea is we do bio_blkcg_css() to look up the actual context for * the bio and attach the appropriate blkg to the bio. Then we call this helper * and if it is true run with the root blkg for that queue and then do any * backcharging to the originating cgroup once the io is complete. */ static inline bool bio_issue_as_root_blkg(struct bio *bio) { return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; } /** * blkg_lookup - lookup blkg for the specified blkcg - q pair * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. * Must be called in a RCU critical section. */ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; if (blkcg == &blkcg_root) return q->root_blkg; blkg = rcu_dereference_check(blkcg->blkg_hint, lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q != q) blkg = NULL; return blkg; } /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest * @pol: policy of interest * * Return pointer to private data associated with the @blkg-@pol pair. */ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return blkg ? blkg->pd[pol->plid] : NULL; } static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** * pdata_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. */ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return pd ? pd->blkg : NULL; } static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) { return cpd ? cpd->blkcg : NULL; } /** * blkg_get - get a blkg reference * @blkg: blkg to get * * The caller should be holding an existing reference. */ static inline void blkg_get(struct blkcg_gq *blkg) { percpu_ref_get(&blkg->refcnt); } /** * blkg_tryget - try and get a blkg reference * @blkg: blkg to get * * This is for use when doing an RCU lookup of the blkg. We may be in the midst * of freeing this blkg, so we can only use it if the refcnt is not zero. */ static inline bool blkg_tryget(struct blkcg_gq *blkg) { return blkg && percpu_ref_tryget(&blkg->refcnt); } /** * blkg_put - put a blkg reference * @blkg: blkg to put */ static inline void blkg_put(struct blkcg_gq *blkg) { percpu_ref_put(&blkg->refcnt); } /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. @p_blkg is * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q))) static inline void blkcg_bio_issue_init(struct bio *bio) { bio_issue_init(&bio->bi_issue, bio_sectors(bio)); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; if (atomic_add_return(1, &blkg->use_delay) == 1) atomic_inc(&blkg->blkcg->congestion_count); } static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); if (WARN_ON_ONCE(old < 0)) return 0; if (old == 0) return 0; /* * We do this song and dance because we can race with somebody else * adding or removing delay. If we just did an atomic_dec we'd end up * negative and we'd already be in trouble. We need to subtract 1 and * then check to see if we were the last delay so we can drop the * congestion count on the cgroup. */ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) ; if (old == 0) return 0; if (old == 1) atomic_dec(&blkg->blkcg->congestion_count); return 1; } /** * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount * @blkg: target blkg * @delay: delay duration in nsecs * * When enabled with this function, the delay is not decayed and must be * explicitly cleared with blkcg_clear_delay(). Must not be mixed with * blkcg_[un]use_delay() and blkcg_add_delay() usages. */ static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person setting the congestion count for this blkg. */ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) atomic_inc(&blkg->blkcg->congestion_count); atomic64_set(&blkg->delay_nsec, delay); } /** * blkcg_clear_delay - Disable allocator delay mechanism * @blkg: target blkg * * Disable use_delay mechanism. See blkcg_set_delay(). */ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) { int old = atomic_read(&blkg->use_delay); /* We only want 1 person clearing the congestion count for this blkg. */ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) atomic_dec(&blkg->blkcg->congestion_count); } /** * blk_cgroup_mergeable - Determine whether to allow or disallow merges * @rq: request to merge into * @bio: bio to merge * * @bio and @rq should belong to the same cgroup and their issue_as_root should * match. The latter is necessary as we don't want to throttle e.g. a metadata * update because it happens to be next to a regular IO. */ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return rq->bio->bi_blkg == bio->bi_blkg && bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); } void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); #else /* CONFIG_BLK_CGROUP */ struct blkg_policy_data { }; struct blkcg_policy_data { }; struct blkcg_policy { }; struct blkcg { }; static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline void blkg_init_queue(struct request_queue *q) { } static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_PRIVATE_H */ |
9 1 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 | // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match Hop-by-Hop and Destination parameters. */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <asm/byteorder.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_opts.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); MODULE_ALIAS("ip6t_dst"); /* * (Type & 0xC0) >> 6 * 0 -> ignorable * 1 -> must drop the packet * 2 -> send ICMP PARM PROB regardless and drop packet * 3 -> Send ICMP if not a multicast address and drop packet * (Type & 0x20) >> 5 * 0 -> invariant * 1 -> can change the routing * (Type & 0x1F) Type * 0 -> Pad1 (only 1 byte!) * 1 -> PadN LENGTH info (total length = length + 2) * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) * 5 -> RTALERT 2 x x */ static struct xt_match hbh_mt6_reg[] __read_mostly; static bool hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_opt_hdr _optsh; const struct ipv6_opt_hdr *oh; const struct ip6t_opts *optinfo = par->matchinfo; unsigned int temp; unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; u8 _opttype; u8 _optlen; const u_int8_t *tp = NULL; const u_int8_t *lp = NULL; unsigned int optlen; int err; err = ipv6_find_hdr(skb, &ptr, (par->match == &hbh_mt6_reg[0]) ? NEXTHDR_HOP : NEXTHDR_DEST, NULL, NULL); if (err < 0) { if (err != -ENOENT) par->hotdrop = true; return false; } oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); if (oh == NULL) { par->hotdrop = true; return false; } hdrlen = ipv6_optlen(oh); if (skb->len - ptr < hdrlen) { /* Packet smaller than it's length field */ return false; } pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); pr_debug("len %02X %04X %02X ", optinfo->hdrlen, hdrlen, (!(optinfo->flags & IP6T_OPTS_LEN) || ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); ret = (!(optinfo->flags & IP6T_OPTS_LEN) || ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); ptr += 2; hdrlen -= 2; if (!(optinfo->flags & IP6T_OPTS_OPTS)) { return ret; } else { pr_debug("Strict "); pr_debug("#%d ", optinfo->optsnr); for (temp = 0; temp < optinfo->optsnr; temp++) { /* type field exists ? */ if (hdrlen < 1) break; tp = skb_header_pointer(skb, ptr, sizeof(_opttype), &_opttype); if (tp == NULL) break; /* Type check */ if (*tp != (optinfo->opts[temp] & 0xFF00) >> 8) { pr_debug("Tbad %02X %02X\n", *tp, (optinfo->opts[temp] & 0xFF00) >> 8); return false; } else { pr_debug("Tok "); } /* Length check */ if (*tp) { u16 spec_len; /* length field exists ? */ if (hdrlen < 2) break; lp = skb_header_pointer(skb, ptr + 1, sizeof(_optlen), &_optlen); if (lp == NULL) break; spec_len = optinfo->opts[temp] & 0x00FF; if (spec_len != 0x00FF && spec_len != *lp) { pr_debug("Lbad %02X %04X\n", *lp, spec_len); return false; } pr_debug("Lok "); optlen = *lp + 2; } else { pr_debug("Pad1\n"); optlen = 1; } /* Step to the next */ pr_debug("len%04X\n", optlen); if ((ptr > skb->len - optlen || hdrlen < optlen) && temp < optinfo->optsnr - 1) { pr_debug("new pointer is too large!\n"); break; } ptr += optlen; hdrlen -= optlen; } if (temp == optinfo->optsnr) return ret; else return false; } return false; } static int hbh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_opts *optsinfo = par->matchinfo; if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { pr_debug("unknown flags %X\n", optsinfo->invflags); return -EINVAL; } if (optsinfo->flags & IP6T_OPTS_NSTRICT) { pr_debug("Not strict - not implemented"); return -EINVAL; } return 0; } static struct xt_match hbh_mt6_reg[] __read_mostly = { { /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */ .name = "hbh", .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, }, { .name = "dst", .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, }, }; static int __init hbh_mt6_init(void) { return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); } static void __exit hbh_mt6_exit(void) { xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); } module_init(hbh_mt6_init); module_exit(hbh_mt6_exit); |
1 18 18 3 18 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H #include <linux/blk-mq.h> #include "blk-stat.h" struct blk_mq_tag_set; struct blk_mq_ctxs { struct kobject kobj; struct blk_mq_ctx __percpu *queue_ctx; }; /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; struct list_head rq_lists[HCTX_MAX_TYPES]; } ____cacheline_aligned_in_smp; unsigned int cpu; unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; enum { BLK_MQ_NO_TAG = -1U, BLK_MQ_TAG_MIN = 1, BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; #define BLK_MQ_CPU_WORK_BATCH (8) typedef unsigned int __bitwise blk_insert_t; #define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) void blk_mq_submit_bio(struct bio *bio); int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, unsigned int); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tags *tags); struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth); void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); /* * CPU -> queue mappings */ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); /* * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue * @q: request queue * @type: the hctx type index * @cpu: CPU */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, enum hctx_type type, unsigned int cpu) { return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, blk_opf_t opf, struct blk_mq_ctx *ctx) { return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* * sysfs helpers */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); int blk_mq_sysfs_register(struct gendisk *disk); void blk_mq_sysfs_unregister(struct gendisk *disk); int blk_mq_sysfs_register_hctxs(struct request_queue *q); void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); void blk_mq_free_plug_rqs(struct blk_plug *plug); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { return per_cpu_ptr(q->queue_ctx, cpu); } /* * This assumes per-cpu software queueing queues. They could be per-node * as well, for instance. For now this is hardcoded as-is. Note that we don't * care about preemption, since we know the ctx's are persistent. This does * mean that we can't rely on ctx always matching the currently running CPU. */ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) { return __blk_mq_get_ctx(q, raw_smp_processor_id()); } struct blk_mq_alloc_data { /* input parameter */ struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; blk_opf_t cmd_flags; req_flags_t rq_flags; /* allocate multiple requests/tags in one go */ unsigned int nr_tags; struct rq_list *cached_rqs; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); void blk_mq_free_tags(struct blk_mq_tags *tags); int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset); void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, struct blk_mq_hw_ctx *hctx) { if (!hctx) return &bt->ws[0]; return sbq_wait_ptr(bt, &hctx->wait_index); } void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_tag_idle(hctx); } static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { return tag < tags->nr_reserved_tags; } static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) { /* Fast path: hardware queue is not stopped most of the time. */ if (likely(!test_bit(BLK_MQ_S_STOPPED, &hctx->state))) return false; /* * This barrier is used to order adding of dispatch list before and * the test of BLK_MQ_S_STOPPED below. Pairs with the memory barrier * in blk_mq_start_stopped_hw_queue() so that dispatch code could * either see BLK_MQ_S_STOPPED is cleared or dispatch list is not * empty to avoid missing dispatching requests. */ smp_mb(); return test_bit(BLK_MQ_S_STOPPED, &hctx->state); } static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) { return hctx->nr_ctx && hctx->tags; } unsigned int blk_mq_in_flight(struct request_queue *q, struct block_device *part); void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q, int budget_token) { if (q->mq_ops->put_budget) q->mq_ops->put_budget(q, budget_token); } static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { if (q->mq_ops->get_budget) return q->mq_ops->get_budget(q); return 0; } static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) { if (token < 0) return; if (rq->q->mq_ops->set_rq_budget_token) rq->q->mq_ops->set_rq_budget_token(rq, token); } static inline int blk_mq_get_rq_budget_token(struct request *rq) { if (rq->q->mq_ops->get_rq_budget_token) return rq->q->mq_ops->get_rq_budget_token(rq); return -1; } static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_add(val, &hctx->nr_active); } static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (blk_mq_is_shared_tags(hctx->flags)) atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else atomic_sub(val, &hctx->nr_active); } static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { __blk_mq_sub_active_requests(hctx, 1); } static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_add_active_requests(hctx, val); } static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_inc_active_requests(hctx); } static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, int val) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_sub_active_requests(hctx, val); } static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) __blk_mq_dec_active_requests(hctx); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; } static inline void blk_mq_put_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) return; __blk_mq_put_driver_tag(rq->mq_hctx, rq); } bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) qmap->mq_map[cpu] = 0; } /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { while (!list_empty(list)) { struct request *rq = list_entry_rq(list->next); list_del_init(&rq->queuelist); blk_mq_free_request(rq); } } /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { unsigned int depth, users; if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) return true; /* * Don't try dividing an ant */ if (bt->sb.depth == 1) return true; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; } users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; /* * Allow at least some tags */ depth = max((bt->sb.depth + users - 1) / users, 4U); return __blk_mq_active_requests(hctx) < depth; } /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ srcu_idx = srcu_read_lock(__tag_set->srcu); \ (dispatch_ops); \ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ } else { \ rcu_read_lock(); \ (dispatch_ops); \ rcu_read_unlock(); \ } \ } while (0) #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ #endif |
662 189 659 938 940 937 588 550 581 8 8 1057 1092 96 95 70 1 69 100 99 101 3 140 143 143 98 52 4642 5120 5134 5120 2 5 4996 8 5011 2 2 818 818 815 298 296 33 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/mman.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> #include <linux/userfaultfd_k.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/personality.h> #include <linux/random.h> #include <linux/processor.h> #include <linux/sizes.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <kunit/visibility.h> #include "internal.h" #include "swap.h" /** * kfree_const - conditionally free memory * @x: pointer to the memory * * Function calls kfree only if @x is not in .rodata section. */ void kfree_const(const void *x) { if (!is_kernel_rodata((unsigned long)x)) kfree(x); } EXPORT_SYMBOL(kfree_const); /** * __kmemdup_nul - Create a NUL-terminated string from @s, which might be unterminated. * @s: The data to copy * @len: The size of the data, not including the NUL terminator * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp) { char *buf; /* '+1' for the NUL terminator */ buf = kmalloc_track_caller(len + 1, gfp); if (!buf) return NULL; memcpy(buf, s, len); /* Ensure the buf is always NUL-terminated, regardless of @s. */ buf[len] = '\0'; return buf; } /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s or %NULL in case of error */ noinline char *kstrdup(const char *s, gfp_t gfp) { return s ? __kmemdup_nul(s, strlen(s), gfp) : NULL; } EXPORT_SYMBOL(kstrdup); /** * kstrdup_const - conditionally duplicate an existing const string * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Strings allocated by kstrdup_const should be freed by kfree_const and * must not be passed to krealloc(). * * Return: source string if it is in .rodata section otherwise * fallback to kstrdup. */ const char *kstrdup_const(const char *s, gfp_t gfp) { if (is_kernel_rodata((unsigned long)s)) return s; return kstrdup(s, gfp); } EXPORT_SYMBOL(kstrdup_const); /** * kstrndup - allocate space for and copy an existing string * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Note: Use kmemdup_nul() instead if the size is known exactly. * * Return: newly allocated copy of @s or %NULL in case of error */ char *kstrndup(const char *s, size_t max, gfp_t gfp) { return s ? __kmemdup_nul(s, strnlen(s, max), gfp) : NULL; } EXPORT_SYMBOL(kstrndup); /** * kmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kmemdup_noprof); /** * kmemdup_array - duplicate a given array. * * @src: array to duplicate. * @count: number of elements to duplicate from array. * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } EXPORT_SYMBOL(kmemdup_array); /** * kvmemdup - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length * @gfp: GFP mask to use * * Return: newly allocated copy of @src or %NULL in case of error, * result may be not physically contiguous. Use kvfree() to free. */ void *kvmemdup(const void *src, size_t len, gfp_t gfp) { void *p; p = kvmalloc(len, gfp); if (p) memcpy(p, src, len); return p; } EXPORT_SYMBOL(kvmemdup); /** * kmemdup_nul - Create a NUL-terminated string from unterminated data * @s: The data to stringify * @len: The size of the data * @gfp: the GFP mask used in the kmalloc() call when allocating memory * * Return: newly allocated copy of @s with NUL-termination or %NULL in * case of error */ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) { return s ? __kmemdup_nul(s, len, gfp) : NULL; } EXPORT_SYMBOL(kmemdup_nul); static kmem_buckets *user_buckets __ro_after_init; static int __init init_user_buckets(void) { user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL); return 0; } subsys_initcall(init_user_buckets); /** * memdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result is physically * contiguous, to be freed by kfree(). */ void *memdup_user(const void __user *src, size_t len) { void *p; p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(memdup_user); /** * vmemdup_user - duplicate memory region from user space * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. Result may be not * physically contiguous. Use kvfree() to free. */ void *vmemdup_user(const void __user *src, size_t len) { void *p; p = kmem_buckets_valloc(user_buckets, len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kvfree(p); return ERR_PTR(-EFAULT); } return p; } EXPORT_SYMBOL(vmemdup_user); /** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { char *p; long length; length = strnlen_user(s, n); if (!length) return ERR_PTR(-EFAULT); if (length > n) return ERR_PTR(-EINVAL); p = memdup_user(s, length); if (IS_ERR(p)) return p; p[length - 1] = '\0'; return p; } EXPORT_SYMBOL(strndup_user); /** * memdup_user_nul - duplicate memory region from user space and NUL-terminate * * @src: source address in user space * @len: number of bytes to copy * * Return: an ERR_PTR() on failure. */ void *memdup_user_nul(const void __user *src, size_t len) { char *p; p = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); if (copy_from_user(p, src, len)) { kfree(p); return ERR_PTR(-EFAULT); } p[len] = '\0'; return p; } EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } /* * Change backing file, only valid to use during initial VMA setup. */ void vma_set_file(struct vm_area_struct *vma, struct file *file) { /* Changing an anonymous vma with this is illegal */ get_file(file); swap(vma->vm_file, file); fput(file); } EXPORT_SYMBOL(vma_set_file); #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP return PAGE_ALIGN(stack_top) + random_variable; #else return PAGE_ALIGN(stack_top) - random_variable; #endif } /** * randomize_page - Generate a random, page aligned address * @start: The smallest acceptable address the caller will take. * @range: The size of the area, starting at @start, within which the * random address must fall. * * If @start + @range would overflow, @range is capped. * * NOTE: Historical use of randomize_range, which this replaces, presumed that * @start was already page aligned. We now align it regardless. * * Return: A page aligned address within [start, start + range). On error, * @start is returned. */ unsigned long randomize_page(unsigned long start, unsigned long range) { if (!PAGE_ALIGNED(start)) { range -= PAGE_ALIGN(start) - start; start = PAGE_ALIGN(start); } if (start > ULONG_MAX - range) range = ULONG_MAX - start; range >>= PAGE_SHIFT; if (range == 0) return start; return start + (get_random_long() % range << PAGE_SHIFT); } #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT unsigned long __weak arch_randomize_brk(struct mm_struct *mm) { /* Is the current task 32bit ? */ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) return randomize_page(mm->brk, SZ_32M); return randomize_page(mm->brk, SZ_1G); } unsigned long arch_mmap_rnd(void) { unsigned long rnd; #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS if (is_compat_task()) rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); else #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } static int mmap_is_legacy(struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; /* On parisc the stack always grows up - so a unlimited stack should * not be an indicator to use the legacy memory layout. */ if (rlim_stack->rlim_cur == RLIM_INFINITY && !IS_ENABLED(CONFIG_STACK_GROWSUP)) return 1; return sysctl_legacy_va_layout; } /* * Leave enough space between the mmap area and the stack to honour ulimit in * the face of randomisation. */ #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* * For an upwards growing stack the calculation is much simpler. * Memory for the maximum stack size is reserved at the top of the * task. mmap_base starts directly below the stack and grows * downwards. */ return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd); #else unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_guard_gap; /* Account for stack randomization if necessary */ if (current->flags & PF_RANDOMIZE) pad += (STACK_RND_MASK << PAGE_SHIFT); /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) gap += pad; if (gap < MIN_GAP && MIN_GAP < MAX_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; return PAGE_ALIGN(STACK_TOP - gap - rnd); #endif } void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; clear_bit(MMF_TOPDOWN, &mm->flags); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); set_bit(MMF_TOPDOWN, &mm->flags); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; clear_bit(MMF_TOPDOWN, &mm->flags); } #endif #ifdef CONFIG_MMU EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout); #endif /** * __account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * @task: task used to check RLIMIT_MEMLOCK * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped * * Assumes @task and @mm are valid (i.e. at least one reference on each), and * that mmap_lock is held as writer. * * Return: * * 0 on success * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; mmap_assert_write_locked(mm); locked_vm = mm->locked_vm; if (inc) { if (!bypass_rlim) { limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked_vm + pages > limit) ret = -ENOMEM; } if (!ret) mm->locked_vm = locked_vm + pages; } else { WARN_ON_ONCE(pages > locked_vm); mm->locked_vm = locked_vm - pages; } pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); return ret; } EXPORT_SYMBOL_GPL(__account_locked_vm); /** * account_locked_vm - account locked pages to an mm's locked_vm * @mm: mm to account against, may be NULL * @pages: number of pages to account * @inc: %true if @pages should be considered positive, %false if not * * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). * * Return: * * 0 on success, or if mm is NULL * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) { int ret; if (pages == 0 || !mm) return 0; mmap_write_lock(mm); ret = __account_locked_vm(mm, pages, inc, current, capable(CAP_IPC_LOCK)); mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL_GPL(account_locked_vm); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; unsigned long populate; LIST_HEAD(uf); ret = security_mmap_file(file, prot, flag); if (!ret) { if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate, &uf); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(ret, populate); } return ret; } unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) { /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore * contribute to a long term fragmentation less than vmalloc fallback. * However make sure that larger requests are not too disruptive - no * OOM killer and no allocation failure warnings as we have a fallback. */ if (size > PAGE_SIZE) { flags |= __GFP_NOWARN; if (!(flags & __GFP_RETRY_MAYFAIL)) flags |= __GFP_NORETRY; /* nofail semantic is implemented by the vmalloc fallback */ flags &= ~__GFP_NOFAIL; } return flags; } /** * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @b: which set of kmalloc buckets to allocate from. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * * Return: pointer to the allocated memory of %NULL in case of failure */ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { void *ret; /* * It doesn't really make sense to fallback to vmalloc for sub page * requests */ ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), kmalloc_gfp_adjust(flags, size), node); if (ret || size <= PAGE_SIZE) return ret; /* non-sleeping allocations are not supported by vmalloc */ if (!gfpflags_allow_blocking(flags)) return NULL; /* Don't even allow crazy sizes */ if (unlikely(size > INT_MAX)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); return NULL; } /* * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, * since the callers already cannot assume anything * about the resulting pointer, and cannot play * protection games. */ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } EXPORT_SYMBOL(__kvmalloc_node_noprof); /** * kvfree() - Free memory. * @addr: Pointer to allocated memory. * * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). * It is slightly more efficient to use kfree() or vfree() if you are certain * that you know which one to use. * * Context: Either preemptible task context or not-NMI interrupt. */ void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) vfree(addr); else kfree(addr); } EXPORT_SYMBOL(kvfree); /** * kvfree_sensitive - Free a data object containing sensitive information. * @addr: address of the data object to be freed. * @len: length of the data object. * * Use the special memzero_explicit() function to clear the content of a * kvmalloc'ed object containing sensitive data to make sure that the * compiler won't optimize out the data clearing. */ void kvfree_sensitive(const void *addr, size_t len) { if (likely(!ZERO_OR_NULL_PTR(addr))) { memzero_explicit((void *)addr, len); kvfree(addr); } } EXPORT_SYMBOL(kvfree_sensitive); /** * kvrealloc - reallocate memory; contents remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate * @flags: the flags for the page level allocator * * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 * and @p is not a %NULL pointer, the object pointed to is freed. * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * * This function must not be called concurrently with itself or kvfree() for the * same memory allocation. * * Return: pointer to the allocated memory or %NULL in case of error */ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) { void *n; if (is_vmalloc_addr(p)) return vrealloc_noprof(p, size, flags); n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); if (!n) { /* We failed to krealloc(), fall back to kvmalloc(). */ n = kvmalloc_noprof(size, flags); if (!n) return NULL; if (p) { /* We already know that `p` is not a vmalloc address. */ kasan_disable_current(); memcpy(n, kasan_reset_tag(p), ksize(p)); kasan_enable_current(); kfree(p); } } return n; } EXPORT_SYMBOL(kvrealloc_noprof); /** * __vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return __vmalloc_noprof(bytes, flags); } EXPORT_SYMBOL(__vmalloc_array_noprof); /** * vmalloc_array - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vmalloc_array_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { return __vmalloc_array_noprof(n, size, flags | __GFP_ZERO); } EXPORT_SYMBOL(__vcalloc_noprof); /** * vcalloc - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ void *vcalloc_noprof(size_t n, size_t size) { return __vmalloc_array_noprof(n, size, GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vcalloc_noprof); struct anon_vma *folio_anon_vma(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) return NULL; return (void *)(mapping - PAGE_MAPPING_ANON); } /** * folio_mapping - Find the mapping where this folio is stored. * @folio: The folio. * * For folios which are in the page cache, return the mapping that this * page belongs to. Folios in the swap cache return the swap mapping * this page is stored in (which is different from the mapping for the * swap file or swap device where the data is stored). * * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ struct address_space *folio_mapping(struct folio *folio) { struct address_space *mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(folio_test_slab(folio))) return NULL; if (unlikely(folio_test_swapcache(folio))) return swap_address_space(folio->swap); mapping = folio->mapping; if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; return mapping; } EXPORT_SYMBOL(folio_mapping); /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. * @src: Folio to copy from. * * The bytes in the folio represented by @src are copied to @dst. * Assumes the caller has validated that @dst is at least as large as @src. * Can be called in atomic context for order-0 folios, but if the folio is * larger, it may sleep. */ void folio_copy(struct folio *dst, struct folio *src) { long i = 0; long nr = folio_nr_pages(src); for (;;) { copy_highpage(folio_page(dst, i), folio_page(src, i)); if (++i == nr) break; cond_resched(); } } EXPORT_SYMBOL(folio_copy); int folio_mc_copy(struct folio *dst, struct folio *src) { long nr = folio_nr_pages(src); long i = 0; for (;;) { if (copy_mc_highpage(folio_page(dst, i), folio_page(src, i))) return -EHWPOISON; if (++i == nr) break; cond_resched(); } return 0; } EXPORT_SYMBOL(folio_mc_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; unsigned long sysctl_overcommit_kbytes __read_mostly; int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int overcommit_ratio_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_kbytes = 0; return ret; } static void sync_overcommit_as(struct work_struct *dummy) { percpu_counter_sync(&vm_committed_as); } int overcommit_policy_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int new_policy = -1; int ret; /* * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch * 2. sync percpu count on each CPU * 3. switch the policy */ if (write) { t = *table; t.data = &new_policy; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || new_policy == -1) return ret; mm_compute_batch(new_policy); if (new_policy == OVERCOMMIT_NEVER) schedule_on_each_cpu(sync_overcommit_as); sysctl_overcommit_memory = new_policy; } else { ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } return ret; } int overcommit_kbytes_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) sysctl_overcommit_ratio = 0; return ret; } /* * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used */ unsigned long vm_commit_limit(void) { unsigned long allowed; if (sysctl_overcommit_kbytes) allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10); else allowed = ((totalram_pages() - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100); allowed += total_swap_pages; return allowed; } /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. */ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; /* * The global memory commitment made in the system can be a metric * that can be used to drive ballooning decisions when Linux is hosted * as a guest. On Hyper-V, the host implements a policy engine for dynamically * balancing memory across competing virtual machines that are hosted. * Several metrics drive this policy engine including the guest reported * memory commitment. * * The time cost of this is very low for small platforms, and for big * platform like a 2S/36C/72T Skylake server, in worst case where * vm_committed_as's spinlock is under severe contention, the time cost * could be about 30~40 microseconds. */ unsigned long vm_memory_committed(void) { return percpu_counter_sum_positive(&vm_committed_as); } EXPORT_SYMBOL_GPL(vm_memory_committed); /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. * * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. * * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; unsigned long bytes_failed; vm_acct_memory(pages); /* * Sometimes we want to use more memory than we have */ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { if (pages > totalram_pages() + total_swap_pages) goto error; return 0; } allowed = vm_commit_limit(); /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); /* * Don't let a single process grow so big a user can't recover */ if (mm) { long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); allowed -= min_t(long, mm->total_vm / 32, reserve); } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: bytes_failed = pages << PAGE_SHIFT; pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n", __func__, current->pid, current->comm, bytes_failed); vm_unacct_memory(pages); return -ENOMEM; } /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. * @buffer: the buffer to copy to. * @buflen: the length of the buffer. Larger cmdline values are truncated * to this length. * * Return: the size of the cmdline field copied. Note that the copy does * not guarantee an ending NULL byte. */ int get_cmdline(struct task_struct *task, char *buffer, int buflen) { int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; spin_unlock(&mm->arg_lock); len = arg_end - arg_start; if (len > buflen) len = buflen; res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); /* * If the nul at the end of args has been overwritten, then * assume application is using setproctitle(3). */ if (res > 0 && buffer[res-1] != '\0' && len < buflen) { len = strnlen(buffer, res); if (len < res) { res = len; } else { len = env_end - env_start; if (len > buflen - res) len = buflen - res; res += access_process_vm(task, env_start, buffer+res, len, FOLL_FORCE); res = strnlen(buffer, res); } } out_mm: mmput(mm); out: return res; } int __weak memcmp_pages(struct page *page1, struct page *page2) { char *addr1, *addr2; int ret; addr1 = kmap_local_page(page1); addr2 = kmap_local_page(page2); ret = memcmp(addr1, addr2, PAGE_SIZE); kunmap_local(addr2); kunmap_local(addr1); return ret; } #ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information * @object: object for which to find provenance information. * * This function uses pr_cont(), so that the caller is expected to have * printed out whatever preamble is appropriate. The provenance information * depends on the type of object and on how much debugging is enabled. * For example, for a slab-cache object, the slab name is printed, and, * if available, the return address and stack trace from the allocation * and last free path of that object. */ void mem_dump_obj(void *object) { const char *type; if (kmem_dump_obj(object)) return; if (vmalloc_dump_obj(object)) return; if (is_vmalloc_addr(object)) type = "vmalloc memory"; else if (virt_addr_valid(object)) type = "non-slab/vmalloc memory"; else if (object == NULL) type = "NULL pointer"; else if (object == ZERO_SIZE_PTR) type = "zero-size pointer"; else type = "non-paged memory"; pr_cont(" %s\n", type); } EXPORT_SYMBOL_GPL(mem_dump_obj); #endif /* * A driver might set a page logically offline -- PageOffline() -- and * turn the page inaccessible in the hypervisor; after that, access to page * content can be fatal. * * Some special PFN walkers -- i.e., /proc/kcore -- read content of random * pages after checking PageOffline(); however, these PFN walkers can race * with drivers that set PageOffline(). * * page_offline_freeze()/page_offline_thaw() allows for a subsystem to * synchronize with such drivers, achieving that a page cannot be set * PageOffline() while frozen. * * page_offline_begin()/page_offline_end() is used by drivers that care about * such races when setting a page PageOffline(). */ static DECLARE_RWSEM(page_offline_rwsem); void page_offline_freeze(void) { down_read(&page_offline_rwsem); } void page_offline_thaw(void) { up_read(&page_offline_rwsem); } void page_offline_begin(void) { down_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_begin); void page_offline_end(void) { up_write(&page_offline_rwsem); } EXPORT_SYMBOL(page_offline_end); #ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio) { long i, nr = folio_nr_pages(folio); for (i = 0; i < nr; i++) flush_dcache_page(folio_page(folio, i)); } EXPORT_SYMBOL(flush_dcache_folio); #endif |
13 6 179 37 7 7 6 5 4 10 216 203 23 7 28 2 17 13 18 1 198 196 1 199 133 56 1 7 53 159 4 6 12 4 1 3 4 6 6 5 3 7 7 7 3 3 3 10 1 5 7 8 2 1 1 2 23 5 5 7 7 3 5 5 5 3 1 7 7 1 7 7 6 7 8 7 7 10 1 1 8 8 8 8 8 1 8 1 8 1 1 1 1 1 1 1 4 4 4 4 3 4 4 1 1 1 1 1 16 3 14 13 3 11 2 13 13 13 13 3 10 6 1 6 12 1 11 2 13 8 12 2 3 1 2 3 3 1 1 2 1 1 3 3 5 1 5 5 4 4 3 5 5 5 5 5 5 3 4 4 4 4 4 4 4 4 3 29 924 925 209 210 167 116 209 81 55 23 210 9 2 2 2 2 2 2 2 2 2 2 1 2 5 2 1 1 2 2 2 2 2 2 41 1 42 4 3 3 5 1 25 1 1 16 25 24 9 15 2 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1 2 2 1 14 1 1 1 1 1 1 1 1 1 3 2 1 1 2 1 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 7 7 17 2 15 14 12 1 13 4 9 13 4 7 2 5 4 1 3 3 1 1 1 2 2 2 1 2 1 4 2 2 1 3 1 3 1 4 1 1 1 2 2 1 2 1 3 13 2 9 2 17 11 1 3 1 1 3 2 4 2 2 1 1 1 5 5 5 5 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS An implementation of the IP virtual server support for the * LINUX operating system. IPVS is now implemented as a module * over the NetFilter framework. IPVS can be used to build a * high-performance and highly available server based on a * cluster of servers. * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> * * Changes: */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/sysctl.h> #include <linux/proc_fs.h> #include <linux/workqueue.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/mutex.h> #include <net/net_namespace.h> #include <linux/nsproxy.h> #include <net/ip.h> #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif #include <net/route.h> #include <net/sock.h> #include <net/genetlink.h> #include <linux/uaccess.h> #include <net/ip_vs.h> MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ /* sysctl variables */ #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; int ip_vs_get_debug_level(void) { return sysctl_ip_vs_debug_level; } #endif /* Protos */ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup); #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ static bool __ip_vs_addr_is_local_v6(struct net *net, const struct in6_addr *addr) { struct flowi6 fl6 = { .daddr = *addr, }; struct dst_entry *dst = ip6_route_output(net, NULL, &fl6); bool is_local; is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK); dst_release(dst); return is_local; } #endif #ifdef CONFIG_SYSCTL /* * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs */ static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; int availmem; int amemthresh; int nomem; int to_change = -1; /* we only count free and buffered memory (in pages) */ si_meminfo(&i); availmem = i.freeram + i.bufferram; /* however in linux 2.5 the i.bufferram is total page cache size, we need adjust it */ /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0); nomem = (availmem < amemthresh); local_bh_disable(); /* drop_entry */ spin_lock(&ipvs->dropentry_lock); switch (ipvs->sysctl_drop_entry) { case 0: atomic_set(&ipvs->dropentry, 0); break; case 1: if (nomem) { atomic_set(&ipvs->dropentry, 1); ipvs->sysctl_drop_entry = 2; } else { atomic_set(&ipvs->dropentry, 0); } break; case 2: if (nomem) { atomic_set(&ipvs->dropentry, 1); } else { atomic_set(&ipvs->dropentry, 0); ipvs->sysctl_drop_entry = 1; } break; case 3: atomic_set(&ipvs->dropentry, 1); break; } spin_unlock(&ipvs->dropentry_lock); /* drop_packet */ spin_lock(&ipvs->droppacket_lock); switch (ipvs->sysctl_drop_packet) { case 0: ipvs->drop_rate = 0; break; case 1: if (nomem) { ipvs->drop_counter = amemthresh / (amemthresh - availmem); ipvs->drop_rate = ipvs->drop_counter; ipvs->sysctl_drop_packet = 2; } else { ipvs->drop_rate = 0; } break; case 2: if (nomem) { ipvs->drop_counter = amemthresh / (amemthresh - availmem); ipvs->drop_rate = ipvs->drop_counter; } else { ipvs->drop_rate = 0; ipvs->sysctl_drop_packet = 1; } break; case 3: ipvs->drop_rate = ipvs->sysctl_am_droprate; break; } spin_unlock(&ipvs->droppacket_lock); /* secure_tcp */ spin_lock(&ipvs->securetcp_lock); switch (ipvs->sysctl_secure_tcp) { case 0: if (ipvs->old_secure_tcp >= 2) to_change = 0; break; case 1: if (nomem) { if (ipvs->old_secure_tcp < 2) to_change = 1; ipvs->sysctl_secure_tcp = 2; } else { if (ipvs->old_secure_tcp >= 2) to_change = 0; } break; case 2: if (nomem) { if (ipvs->old_secure_tcp < 2) to_change = 1; } else { if (ipvs->old_secure_tcp >= 2) to_change = 0; ipvs->sysctl_secure_tcp = 1; } break; case 3: if (ipvs->old_secure_tcp < 2) to_change = 1; break; } ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(ipvs, ipvs->sysctl_secure_tcp > 1); spin_unlock(&ipvs->securetcp_lock); local_bh_enable(); } /* Handler for delayed work for expiring no * destination connections */ static void expire_nodest_conn_handler(struct work_struct *work) { struct netns_ipvs *ipvs; ipvs = container_of(work, struct netns_ipvs, expire_nodest_conn_work.work); ip_vs_expire_nodest_conn_flush(ipvs); } /* * Timer for checking the defense */ #define DEFENSE_TIMER_PERIOD 1*HZ static void defense_work_handler(struct work_struct *work) { struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs, defense_work.work); update_defense_level(ipvs); if (atomic_read(&ipvs->dropentry)) ip_vs_random_dropentry(ipvs); queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); } #endif static void est_reload_work_handler(struct work_struct *work) { struct netns_ipvs *ipvs = container_of(work, struct netns_ipvs, est_reload_work.work); int genid_done = atomic_read(&ipvs->est_genid_done); unsigned long delay = HZ / 10; /* repeat startups after failure */ bool repeat = false; int genid; int id; mutex_lock(&ipvs->est_mutex); genid = atomic_read(&ipvs->est_genid); for (id = 0; id < ipvs->est_kt_count; id++) { struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; /* netns clean up started, abort delayed work */ if (!ipvs->enable) goto unlock; if (!kd) continue; /* New config ? Stop kthread tasks */ if (genid != genid_done) ip_vs_est_kthread_stop(kd); if (!kd->task && !ip_vs_est_stopped(ipvs)) { /* Do not start kthreads above 0 in calc phase */ if ((!id || !ipvs->est_calc_phase) && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } } atomic_set(&ipvs->est_genid_done, genid); if (repeat) queue_delayed_work(system_long_wq, &ipvs->est_reload_work, delay); unlock: mutex_unlock(&ipvs->est_mutex); } int ip_vs_use_count_inc(void) { return try_module_get(THIS_MODULE); } void ip_vs_use_count_dec(void) { module_put(THIS_MODULE); } /* * Hash table: for virtual service lookups */ #define IP_VS_SVC_TAB_BITS 8 #define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) #define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* the service table hashed by <protocol, addr, port> */ static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; /* * Returns hash value for virtual service */ static inline unsigned int ip_vs_svc_hashkey(struct netns_ipvs *ipvs, int af, unsigned int proto, const union nf_inet_addr *addr, __be16 port) { unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; __u32 ahash; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif ahash = ntohl(addr_fold); ahash ^= ((size_t) ipvs >> 8); return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) & IP_VS_SVC_TAB_MASK; } /* * Returns hash value of fwmark for virtual service lookup */ static inline unsigned int ip_vs_svc_fwm_hashkey(struct netns_ipvs *ipvs, __u32 fwmark) { return (((size_t)ipvs>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port> * or in the ip_vs_svc_fwm_table by fwmark. * Should be called with locked tables. */ static int ip_vs_svc_hash(struct ip_vs_service *svc) { unsigned int hash; if (svc->flags & IP_VS_SVC_F_HASHED) { pr_err("%s(): request for already hashed, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { /* * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table */ hash = ip_vs_svc_hashkey(svc->ipvs, svc->af, svc->protocol, &svc->addr, svc->port); hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* * Hash it by fwmark in svc_fwm_table */ hash = ip_vs_svc_fwm_hashkey(svc->ipvs, svc->fwmark); hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } svc->flags |= IP_VS_SVC_F_HASHED; /* increase its refcnt because it is referenced by the svc table */ atomic_inc(&svc->refcnt); return 1; } /* * Unhashes a service from svc_table / svc_fwm_table. * Should be called with locked tables. */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) { if (!(svc->flags & IP_VS_SVC_F_HASHED)) { pr_err("%s(): request for unhash flagged, called from %pS\n", __func__, __builtin_return_address(0)); return 0; } if (svc->fwmark == 0) { /* Remove it from the svc_table table */ hlist_del_rcu(&svc->s_list); } else { /* Remove it from the svc_fwm_table table */ hlist_del_rcu(&svc->f_list); } svc->flags &= ~IP_VS_SVC_F_HASHED; atomic_dec(&svc->refcnt); return 1; } /* * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { unsigned int hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ hash = ip_vs_svc_hashkey(ipvs, af, protocol, vaddr, vport); hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) { if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) && (svc->protocol == protocol) && (svc->ipvs == ipvs)) { /* HIT */ return svc; } } return NULL; } /* * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark) { unsigned int hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ hash = ip_vs_svc_fwm_hashkey(ipvs, fwmark); hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af && (svc->ipvs == ipvs)) { /* HIT */ return svc; } } return NULL; } /* Find service, called under RCU lock */ struct ip_vs_service * ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; /* * Check the table hashed by fwmark first */ if (fwmark) { svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark); if (svc) goto out; } /* * Check the table hashed by <protocol,addr,port> * for "full" addressed entries */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport); if (!svc && protocol == IPPROTO_TCP && atomic_read(&ipvs->ftpsvc_counter) && (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT); } if (svc == NULL && atomic_read(&ipvs->nullsvc_counter)) { /* * Check if the catch-all port (port zero) exists */ svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0); } out: IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", fwmark, ip_vs_proto_name(protocol), IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), svc ? "hit" : "not hit"); return svc; } static inline void __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) { atomic_inc(&svc->refcnt); rcu_assign_pointer(dest->svc, svc); } static void ip_vs_service_free(struct ip_vs_service *svc) { ip_vs_stats_release(&svc->stats); kfree(svc); } static void ip_vs_service_rcu_free(struct rcu_head *head) { struct ip_vs_service *svc; svc = container_of(head, struct ip_vs_service, rcu_head); ip_vs_service_free(svc); } static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } /* * Returns hash value for real service */ static inline unsigned int ip_vs_rs_hashkey(int af, const union nf_inet_addr *addr, __be16 port) { unsigned int porth = ntohs(port); __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) & IP_VS_RTAB_MASK; } /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */ static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned int hash; __be16 port; if (dest->in_rs_table) return; switch (IP_VS_DFWD_METHOD(dest)) { case IP_VS_CONN_F_MASQ: port = dest->port; break; case IP_VS_CONN_F_TUNNEL: switch (dest->tun_type) { case IP_VS_CONN_F_TUNNEL_TYPE_GUE: port = dest->tun_port; break; case IP_VS_CONN_F_TUNNEL_TYPE_IPIP: case IP_VS_CONN_F_TUNNEL_TYPE_GRE: port = 0; break; default: return; } break; default: return; } /* * Hash by proto,addr,port, * which are the parameters of the real service. */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port); hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]); dest->in_rs_table = 1; } /* Unhash ip_vs_dest from rs_table. */ static void ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* * Remove it from the rs_table table. */ if (dest->in_rs_table) { hlist_del_rcu(&dest->d_list); dest->in_rs_table = 0; } } /* Check if real service by <proto,addr,port> is present */ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { unsigned int hash; struct ip_vs_dest *dest; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return true; } } return false; } /* Find real service record by <proto,addr,port>. * In case of multiple records with the same <proto,addr,port>, only * the first found record is returned. * * To be called under RCU lock. */ struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { unsigned int hash; struct ip_vs_dest *dest; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, dport); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->port == dport && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->protocol == protocol || dest->vfwmark) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) { /* HIT */ return dest; } } return NULL; } /* Find real service record by <af,addr,tun_port>. * In case of multiple records with the same <af,addr,tun_port>, only * the first found record is returned. * * To be called under RCU lock. */ struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af, const union nf_inet_addr *daddr, __be16 tun_port) { struct ip_vs_dest *dest; unsigned int hash; /* Check for "full" addressed entries */ hash = ip_vs_rs_hashkey(af, daddr, tun_port); hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { if (dest->tun_port == tun_port && dest->af == af && ip_vs_addr_equal(af, &dest->addr, daddr) && IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) { /* HIT */ return dest; } } return NULL; } /* Lookup destination by {addr,port} in the given service * Called under RCU lock. */ static struct ip_vs_dest * ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; /* * Find the destination for the given service */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if ((dest->af == dest_af) && ip_vs_addr_equal(dest_af, &dest->addr, daddr) && (dest->port == dport)) { /* HIT */ return dest; } } return NULL; } /* * Find destination by {daddr,dport,vaddr,protocol} * Created to be used in ip_vs_process_message() in * the backup synchronization daemon. It finds the * destination to be bound to the received connection * on the backup. * Called under RCU lock, no refcnt is returned. */ struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af, const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol, __u32 fwmark, __u32 flags) { struct ip_vs_dest *dest; struct ip_vs_service *svc; __be16 port = dport; svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) port = 0; dest = ip_vs_lookup_dest(svc, dest_af, daddr, port); if (!dest) dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport); return dest; } void ip_vs_dest_dst_rcu_free(struct rcu_head *head) { struct ip_vs_dest_dst *dest_dst = container_of(head, struct ip_vs_dest_dst, rcu_head); dst_release(dest_dst->dst_cache); kfree(dest_dst); } /* Release dest_dst and dst_cache for dest in user context */ static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest) { struct ip_vs_dest_dst *old; old = rcu_dereference_protected(dest->dest_dst, 1); if (old) { RCU_INIT_POINTER(dest->dest_dst, NULL); call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free); } } /* * Lookup dest by {svc,addr,port} in the destination trash. * The destination trash is used to hold the destinations that are removed * from the service table but are still referenced by some conn entries. * The reason to add the destination trash is when the dest is temporary * down (either by administrator or by monitor program), the dest can be * picked back from the trash, the remaining connections to the dest can * continue, and the counting information of the dest is also useful for * scheduling. */ static struct ip_vs_dest * ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest; struct netns_ipvs *ipvs = svc->ipvs; /* * Find the destination in trash */ spin_lock_bh(&ipvs->dest_trash_lock); list_for_each_entry(dest, &ipvs->dest_trash, t_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); if (dest->af == dest_af && ip_vs_addr_equal(dest_af, &dest->addr, daddr) && dest->port == dport && dest->vfwmark == svc->fwmark && dest->protocol == svc->protocol && (svc->fwmark || (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && dest->vport == svc->port))) { /* HIT */ list_del(&dest->t_list); goto out; } } dest = NULL; out: spin_unlock_bh(&ipvs->dest_trash_lock); return dest; } static void ip_vs_dest_rcu_free(struct rcu_head *head) { struct ip_vs_dest *dest; dest = container_of(head, struct ip_vs_dest, rcu_head); ip_vs_stats_release(&dest->stats); ip_vs_dest_put_and_free(dest); } static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_dst_cache_reset(dest); __ip_vs_svc_put(svc); call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* * Clean up all the destinations in the trash * Called by the ip_vs_control_cleanup() * * When the ip_vs_control_clearup is activated by ipvs module exit, * the service tables must have been flushed and all the connections * are expired, and the refcnt of each destination in the trash must * be 1, so we simply release them here. */ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; del_timer_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { list_del(&dest->t_list); ip_vs_dest_free(dest); } } static void ip_vs_stats_rcu_free(struct rcu_head *head) { struct ip_vs_stats_rcu *rs = container_of(head, struct ip_vs_stats_rcu, rcu_head); ip_vs_stats_release(&rs->s); kfree(rs); } static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); IP_VS_SHOW_STATS_COUNTER(outpkts); IP_VS_SHOW_STATS_COUNTER(inbytes); IP_VS_SHOW_STATS_COUNTER(outbytes); ip_vs_read_estimator(dst, src); spin_unlock(&src->lock); } static void ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) { dst->conns = (u32)src->conns; dst->inpkts = (u32)src->inpkts; dst->outpkts = (u32)src->outpkts; dst->inbytes = src->inbytes; dst->outbytes = src->outbytes; dst->cps = (u32)src->cps; dst->inpps = (u32)src->inpps; dst->outpps = (u32)src->outpps; dst->inbps = (u32)src->inbps; dst->outbps = (u32)src->outbps; } static void ip_vs_zero_stats(struct ip_vs_stats *stats) { spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c IP_VS_ZERO_STATS_COUNTER(conns); IP_VS_ZERO_STATS_COUNTER(inpkts); IP_VS_ZERO_STATS_COUNTER(outpkts); IP_VS_ZERO_STATS_COUNTER(inbytes); IP_VS_ZERO_STATS_COUNTER(outbytes); ip_vs_zero_estimator(stats); spin_unlock(&stats->lock); } /* Allocate fields after kzalloc */ int ip_vs_stats_init_alloc(struct ip_vs_stats *s) { int i; spin_lock_init(&s->lock); s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); if (!s->cpustats) return -ENOMEM; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); u64_stats_init(&cs->syncp); } return 0; } struct ip_vs_stats *ip_vs_stats_alloc(void) { struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); if (s && ip_vs_stats_init_alloc(s) >= 0) return s; kfree(s); return NULL; } void ip_vs_stats_release(struct ip_vs_stats *stats) { free_percpu(stats->cpustats); } void ip_vs_stats_free(struct ip_vs_stats *stats) { if (stats) { ip_vs_stats_release(stats); kfree(stats); } } /* * Update a destination in the given service */ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { struct netns_ipvs *ipvs = svc->ipvs; struct ip_vs_service *old_svc; struct ip_vs_scheduler *sched; int conn_flags; /* We cannot modify an address and change the address family */ BUG_ON(!add && udest->af != dest->af); if (add && udest->af != svc->af) ipvs->mixed_address_family_dests++; /* keep the last_weight with latest non-0 weight */ if (add || udest->weight != 0) atomic_set(&dest->last_weight, udest->weight); /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; /* Need to rehash? */ if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_DFWD_METHOD(dest) || udest->tun_type != dest->tun_type || udest->tun_port != dest->tun_port) ip_vs_rs_unhash(dest); /* set the tunnel info */ dest->tun_type = udest->tun_type; dest->tun_port = udest->tun_port; dest->tun_flags = udest->tun_flags; /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { /* FTP-NAT requires conntrack for mangling */ if (svc->port == FTPPORT) ip_vs_register_conntrack(svc); } atomic_set(&dest->conn_flags, conn_flags); /* Put the real service in rs_table if not present. */ ip_vs_rs_hash(ipvs, dest); /* bind the service */ old_svc = rcu_dereference_protected(dest->svc, 1); if (!old_svc) { __ip_vs_bind_svc(dest, svc); } else { if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); __ip_vs_svc_put(old_svc); } } /* set the dest status flags */ dest->flags |= IP_VS_DEST_F_AVAILABLE; if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) dest->flags &= ~IP_VS_DEST_F_OVERLOAD; dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; dest->af = udest->af; spin_lock_bh(&dest->dst_lock); __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); if (add) { list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } /* * Create a destination for the given service */ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; unsigned int atype; int ret; #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6)) return -EINVAL; ret = nf_defrag_ipv6_enable(svc->ipvs->net); if (ret) return ret; } else #endif { atype = inet_addr_type(svc->ipvs->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); if (dest == NULL) return -ENOMEM; ret = ip_vs_stats_init_alloc(&dest->stats); if (ret < 0) goto err_alloc; ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) goto err_stats; dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; dest->vport = svc->port; dest->vfwmark = svc->fwmark; ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr); dest->port = udest->port; atomic_set(&dest->activeconns, 0); atomic_set(&dest->inactconns, 0); atomic_set(&dest->persistconns, 0); refcount_set(&dest->refcnt, 1); INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); __ip_vs_update_dest(svc, dest, udest, 1); return 0; err_stats: ip_vs_stats_release(&dest->stats); err_alloc: kfree(dest); return ret; } /* * Add a destination into an existing service */ static int ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; int ret; if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { pr_err("%s(): lower threshold is higher than upper threshold\n", __func__); return -ERANGE; } if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { if (udest->tun_port == 0) { pr_err("%s(): tunnel port is zero\n", __func__); return -EINVAL; } } ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest != NULL) { IP_VS_DBG(1, "%s(): dest already exists\n", __func__); return -EEXIST; } /* * Check if the dest already exists in the trash and * is from the same service */ dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport); if (dest != NULL) { IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), refcount_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); if (ret < 0) return ret; __ip_vs_update_dest(svc, dest, udest, 1); } else { /* * Allocate and initialize the dest structure */ ret = ip_vs_new_dest(svc, udest); } return ret; } /* * Edit a destination in the given service */ static int ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; union nf_inet_addr daddr; __be16 dport = udest->port; if (udest->weight < 0) { pr_err("%s(): server weight less than zero\n", __func__); return -ERANGE; } if (udest->l_threshold > udest->u_threshold) { pr_err("%s(): lower threshold is higher than upper threshold\n", __func__); return -ERANGE; } if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { if (udest->tun_port == 0) { pr_err("%s(): tunnel port is zero\n", __func__); return -EINVAL; } } ip_vs_addr_copy(udest->af, &daddr, &udest->addr); /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport); rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__); return -ENOENT; } __ip_vs_update_dest(svc, dest, udest, 0); return 0; } /* * Delete a destination (must be already unlinked from the service) */ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, bool cleanup) { ip_vs_stop_estimator(ipvs, &dest->stats); /* * Remove it from the d-linked list with the real services. */ ip_vs_rs_unhash(dest); spin_lock_bh(&ipvs->dest_trash_lock); IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); if (list_empty(&ipvs->dest_trash) && !cleanup) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); /* dest lives in trash with reference */ list_add(&dest->t_list, &ipvs->dest_trash); dest->idle_start = 0; spin_unlock_bh(&ipvs->dest_trash_lock); /* Queue up delayed work to expire all no destination connections. * No-op when CONFIG_SYSCTL is disabled. */ if (!cleanup) ip_vs_enqueue_expire_nodest_conns(ipvs); } /* * Unlink a destination from the given service */ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, int svcupd) { dest->flags &= ~IP_VS_DEST_F_AVAILABLE; /* * Remove it from the d-linked destination list. */ list_del_rcu(&dest->n_list); svc->num_dests--; if (dest->af != svc->af) svc->ipvs->mixed_address_family_dests--; if (svcupd) { struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); if (sched && sched->del_dest) sched->del_dest(svc, dest); } } /* * Delete a destination server in the given service */ static int ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; __be16 dport = udest->port; /* We use function that requires RCU lock */ rcu_read_lock(); dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport); rcu_read_unlock(); if (dest == NULL) { IP_VS_DBG(1, "%s(): destination not found!\n", __func__); return -ENOENT; } /* * Unlink dest from the service */ __ip_vs_unlink_dest(svc, dest, 1); /* * Delete the destination */ __ip_vs_del_dest(svc->ipvs, dest, false); return 0; } static void ip_vs_dest_trash_expire(struct timer_list *t) { struct netns_ipvs *ipvs = from_timer(ipvs, t, dest_trash_timer); struct ip_vs_dest *dest, *next; unsigned long now = jiffies; spin_lock(&ipvs->dest_trash_lock); list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { if (refcount_read(&dest->refcnt) > 1) continue; if (dest->idle_start) { if (time_before(now, dest->idle_start + IP_VS_DEST_TRASH_PERIOD)) continue; } else { dest->idle_start = max(1UL, now); continue; } IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); list_del(&dest->t_list); ip_vs_dest_free(dest); } if (!list_empty(&ipvs->dest_trash)) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); spin_unlock(&ipvs->dest_trash_lock); } /* * Add a service into the service hash table */ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; int ret_hooks = -1; /* increase the module use count */ if (!ip_vs_use_count_inc()) return -ENOPROTOOPT; /* Lookup the scheduler by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { sched = ip_vs_scheduler_get(u->sched_name); if (!sched) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); ret = -ENOENT; goto out_err; } } if (u->pe_name && *u->pe_name) { pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); ret = -ENOENT; goto out_err; } } #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { __u32 plen = (__force __u32) u->netmask; if (plen < 1 || plen > 128) { ret = -EINVAL; goto out_err; } ret = nf_defrag_ipv6_enable(ipvs->net); if (ret) goto out_err; } #endif if ((u->af == AF_INET && !ipvs->num_services) || (u->af == AF_INET6 && !ipvs->num_services6)) { ret = ip_vs_register_hooks(ipvs, u->af); if (ret < 0) goto out_err; ret_hooks = ret; } svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; goto out_err; } ret = ip_vs_stats_init_alloc(&svc->stats); if (ret < 0) goto out_err; /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); svc->af = u->af; svc->protocol = u->protocol; ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; svc->ipvs = ipvs; INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); /* Bind the scheduler */ if (sched) { ret = ip_vs_bind_scheduler(svc, sched); if (ret) goto out_err; sched = NULL; } ret = ip_vs_start_estimator(ipvs, &svc->stats); if (ret < 0) goto out_err; /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); if (pe && pe->conn_out) atomic_inc(&ipvs->conn_out_counter); /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; else if (svc->af == AF_INET6) ipvs->num_services6++; /* Hash the service into the service table */ ip_vs_svc_hash(svc); *svc_p = svc; if (!ipvs->enable) { /* Now there is a service - full throttle */ ipvs->enable = 1; /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); } return 0; out_err: if (ret_hooks >= 0) ip_vs_unregister_hooks(ipvs, u->af); if (svc != NULL) { ip_vs_unbind_scheduler(svc, sched); ip_vs_service_free(svc); } ip_vs_scheduler_put(sched); ip_vs_pe_put(pe); /* decrease the module use count */ ip_vs_use_count_dec(); return ret; } /* * Edit a service and bind it with a new scheduler */ static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; bool new_pe_conn_out, old_pe_conn_out; /* * Lookup the scheduler, by 'u->sched_name' */ if (strcmp(u->sched_name, "none")) { sched = ip_vs_scheduler_get(u->sched_name); if (!sched) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); return -ENOENT; } } old_sched = sched; if (u->pe_name && *u->pe_name) { pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); ret = -ENOENT; goto out; } old_pe = pe; } #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6) { __u32 plen = (__force __u32) u->netmask; if (plen < 1 || plen > 128) { ret = -EINVAL; goto out; } } #endif old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { if (old_sched) { ip_vs_unbind_scheduler(svc, old_sched); RCU_INIT_POINTER(svc->scheduler, NULL); /* Wait all svc->sched_data users */ synchronize_rcu(); } /* Bind the new scheduler */ if (sched) { ret = ip_vs_bind_scheduler(svc, sched); if (ret) { ip_vs_scheduler_put(sched); goto out; } } } /* * Set the flags and timeout value */ svc->flags = u->flags | IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); /* check for optional methods in new pe */ new_pe_conn_out = (pe && pe->conn_out) ? true : false; old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; if (new_pe_conn_out && !old_pe_conn_out) atomic_inc(&svc->ipvs->conn_out_counter); if (old_pe_conn_out && !new_pe_conn_out) atomic_dec(&svc->ipvs->conn_out_counter); } out: ip_vs_scheduler_put(old_sched); ip_vs_pe_put(old_pe); return ret; } /* * Delete a service from the service list * - The service must be unlinked, unlocked and not referenced! * - We are called under _bh lock */ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; struct netns_ipvs *ipvs = svc->ipvs; if (svc->af == AF_INET) { ipvs->num_services--; if (!ipvs->num_services) ip_vs_unregister_hooks(ipvs, svc->af); } else if (svc->af == AF_INET6) { ipvs->num_services6--; if (!ipvs->num_services6) ip_vs_unregister_hooks(ipvs, svc->af); } ip_vs_stop_estimator(svc->ipvs, &svc->stats); /* Unbind scheduler */ old_sched = rcu_dereference_protected(svc->scheduler, 1); ip_vs_unbind_scheduler(svc, old_sched); ip_vs_scheduler_put(old_sched); /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); if (old_pe && old_pe->conn_out) atomic_dec(&ipvs->conn_out_counter); ip_vs_pe_put(old_pe); /* * Unlink the whole destination list */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); __ip_vs_del_dest(svc->ipvs, dest, cleanup); } /* * Update the virtual service counters */ if (svc->port == FTPPORT) atomic_dec(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_dec(&ipvs->nullsvc_counter); /* * Free the service if nobody refers to it */ __ip_vs_svc_put(svc); /* decrease the module use count */ ip_vs_use_count_dec(); } /* * Unlink a service from list and try to delete it if its refcnt reached 0 */ static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup) { ip_vs_unregister_conntrack(svc); /* Hold svc to avoid double release from dest_trash */ atomic_inc(&svc->refcnt); /* * Unhash it from the service table */ ip_vs_svc_unhash(svc); __ip_vs_del_service(svc, cleanup); } /* * Delete a service from the service list */ static int ip_vs_del_service(struct ip_vs_service *svc) { if (svc == NULL) return -EEXIST; ip_vs_unlink_service(svc, false); return 0; } /* * Flush all the virtual services */ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) { int idx; struct ip_vs_service *svc; struct hlist_node *n; /* * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } /* * Flush the service table hashed by fwmark */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) ip_vs_unlink_service(svc, cleanup); } } return 0; } /* * Delete service by {netns} in the service table. * Called by __ip_vs_batch_cleanup() */ void ip_vs_service_nets_cleanup(struct list_head *net_list) { struct netns_ipvs *ipvs; struct net *net; /* Check for "full" addressed entries */ mutex_lock(&__ip_vs_mutex); list_for_each_entry(net, net_list, exit_list) { ipvs = net_ipvs(net); ip_vs_flush(ipvs, true); } mutex_unlock(&__ip_vs_mutex); } /* Put all references for device (dst_cache) */ static inline void ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) { struct ip_vs_dest_dst *dest_dst; spin_lock_bh(&dest->dst_lock); dest_dst = rcu_dereference_protected(dest->dest_dst, 1); if (dest_dst && dest_dst->dst_cache->dev == dev) { IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), refcount_read(&dest->refcnt)); __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); } /* Netdev event receiver * Currently only NETDEV_DOWN is handled to release refs to cached dsts */ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_service *svc; struct ip_vs_dest *dest; unsigned int idx; if (event != NETDEV_DOWN || !ipvs) return NOTIFY_DONE; IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); mutex_lock(&__ip_vs_mutex); for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); } } } hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) { list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_forget_dev(dest, dev); } } } } spin_lock_bh(&ipvs->dest_trash_lock); list_for_each_entry(dest, &ipvs->dest_trash, t_list) { ip_vs_forget_dev(dest, dev); } spin_unlock_bh(&ipvs->dest_trash_lock); mutex_unlock(&__ip_vs_mutex); return NOTIFY_DONE; } /* * Zero counters in a service or all services */ static int ip_vs_zero_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest; list_for_each_entry(dest, &svc->destinations, n_list) { ip_vs_zero_stats(&dest->stats); } ip_vs_zero_stats(&svc->stats); return 0; } static int ip_vs_zero_all(struct netns_ipvs *ipvs) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { if (svc->ipvs == ipvs) ip_vs_zero_service(svc); } } ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } #ifdef CONFIG_SYSCTL static int proc_do_defense_mode(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int rc; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { if (val < 0 || val > 3) { rc = -EINVAL; } else { *valp = val; update_defense_level(ipvs); } } return rc; } static int proc_do_sync_threshold(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val[2]; int rc; struct ctl_table tmp = { .data = &val, .maxlen = table->maxlen, .mode = table->mode, }; mutex_lock(&ipvs->sync_mutex); memcpy(val, valp, sizeof(val)); rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { if (val[0] < 0 || val[1] < 0 || (val[0] >= val[1] && val[1])) rc = -EINVAL; else memcpy(valp, val, sizeof(val)); } mutex_unlock(&ipvs->sync_mutex); return rc; } static int proc_do_sync_ports(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; int rc; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; rc = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write && (*valp != val)) { if (val < 1 || !is_power_of_2(val)) rc = -EINVAL; else *valp = val; } return rc; } static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, void *buffer) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; cpumask_var_t newmask; int ret; if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) return -ENOMEM; ret = cpulist_parse(buffer, newmask); if (ret) goto out; mutex_lock(&ipvs->est_mutex); if (!ipvs->est_cpulist_valid) { if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { ret = -ENOMEM; goto unlock; } ipvs->est_cpulist_valid = 1; } cpumask_and(newmask, newmask, ¤t->cpus_mask); cpumask_copy(*valp, newmask); /* est_max_threads may depend on cpulist size */ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); ipvs->est_calc_phase = 1; ip_vs_est_reload_start(ipvs); unlock: mutex_unlock(&ipvs->est_mutex); out: free_cpumask_var(newmask); return ret; } static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, void *buffer, size_t size) { struct netns_ipvs *ipvs = table->extra2; cpumask_var_t *valp = table->data; struct cpumask *mask; int ret; mutex_lock(&ipvs->est_mutex); if (ipvs->est_cpulist_valid) mask = *valp; else mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); mutex_unlock(&ipvs->est_mutex); return ret; } static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; /* Ignore both read and write(append) if *ppos not 0 */ if (*ppos || !*lenp) { *lenp = 0; return 0; } if (write) { /* proc_sys_call_handler() appends terminator */ ret = ipvs_proc_est_cpumask_set(table, buffer); if (ret >= 0) *ppos += *lenp; } else { /* proc_sys_call_handler() allocates 1 byte for terminator */ ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); if (ret >= 0) { *lenp = ret; *ppos += *lenp; ret = 0; } } return ret; } static int ipvs_proc_est_nice(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int ret; struct ctl_table tmp_table = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); if (write && ret >= 0) { if (val < MIN_NICE || val > MAX_NICE) { ret = -EINVAL; } else { mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; ip_vs_est_reload_start(ipvs); } mutex_unlock(&ipvs->est_mutex); } } return ret; } static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; int val = *valp; int ret; struct ctl_table tmp_table = { .data = &val, .maxlen = sizeof(int), .mode = table->mode, }; ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); if (write && ret >= 0) { mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; ip_vs_est_reload_start(ipvs); } mutex_unlock(&ipvs->est_mutex); } return ret; } /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without * align with netns init in ip_vs_control_net_init() */ static struct ctl_table vs_vars[] = { { .procname = "amemthresh", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "am_droprate", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "drop_entry", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "drop_packet", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, #ifdef CONFIG_IP_VS_NFCT { .procname = "conntrack", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, #endif { .procname = "secure_tcp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "snat_reroute", .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, { .procname = "sync_version", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "sync_ports", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_sync_ports, }, { .procname = "sync_persist_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sync_qlen_max", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "sync_sock_size", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cache_bypass", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "expire_nodest_conn", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sloppy_tcp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sloppy_sctp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "expire_quiescent_template", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sync_threshold", .maxlen = sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), .mode = 0644, .proc_handler = proc_do_sync_threshold, }, { .procname = "sync_refresh_period", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "sync_retries", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "nat_icmp_send", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "pmtu_disc", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "backup_only", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "conn_reuse_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "schedule_icmp", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ignore_tunneled", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "run_estimation", .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipvs_proc_run_estimation, }, { .procname = "est_cpulist", .maxlen = NR_CPUS, /* unused */ .mode = 0644, .proc_handler = ipvs_proc_est_cpulist, }, { .procname = "est_nice", .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipvs_proc_est_nice, }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", .data = &sysctl_ip_vs_debug_level, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif }; #endif #ifdef CONFIG_PROC_FS struct ip_vs_iter { struct seq_net_private p; /* Do not move this, netns depends upon it*/ struct hlist_head *table; int bucket; }; /* * Write the contents of the VS rule table to a PROCfs file. * (It is kept just for backward compatibility) */ static inline const char *ip_vs_fwd_name(unsigned int flags) { switch (flags & IP_VS_CONN_F_FWD_MASK) { case IP_VS_CONN_F_LOCALNODE: return "Local"; case IP_VS_CONN_F_TUNNEL: return "Tunnel"; case IP_VS_CONN_F_DROUTE: return "Route"; default: return "Masq"; } } /* Get the Nth entry in the two lists */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) { if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; } } } /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx], f_list) { if ((svc->ipvs == ipvs) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; } } } return NULL; } static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct hlist_node *e; struct ip_vs_iter *iter; struct ip_vs_service *svc; ++*pos; if (v == SEQ_START_TOKEN) return ip_vs_info_array(seq,0); svc = v; iter = seq->private; if (iter->table == ip_vs_svc_table) { /* next service in table hashed by protocol */ e = rcu_dereference(hlist_next_rcu(&svc->s_list)); if (e) return hlist_entry(e, struct ip_vs_service, s_list); while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[iter->bucket], s_list) { return svc; } } iter->table = ip_vs_svc_fwm_table; iter->bucket = -1; goto scan_fwmark; } /* next service in hashed by fwmark */ e = rcu_dereference(hlist_next_rcu(&svc->f_list)); if (e) return hlist_entry(e, struct ip_vs_service, f_list); scan_fwmark: while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[iter->bucket], f_list) return svc; } return NULL; } static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip_vs_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "IP Virtual Server version %d.%d.%d (size=%d)\n", NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); seq_puts(seq, "Prot LocalAddress:Port Scheduler Flags\n"); seq_puts(seq, " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); } else { struct net *net = seq_file_net(seq); struct netns_ipvs *ipvs = net_ipvs(net); const struct ip_vs_service *svc = v; const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); char *sched_name = sched ? sched->name : "none"; if (svc->ipvs != ipvs) return 0; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) seq_printf(seq, "%s [%pI6]:%04X %s ", ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } if (svc->flags & IP_VS_SVC_F_PERSISTENT) seq_printf(seq, "persistent %d %08X\n", svc->timeout, ntohl(svc->netmask)); else seq_putc(seq, '\n'); list_for_each_entry_rcu(dest, &svc->destinations, n_list) { #ifdef CONFIG_IP_VS_IPV6 if (dest->af == AF_INET6) seq_printf(seq, " -> [%pI6]:%04X" " %-7s %-6d %-10d %-10d\n", &dest->addr.in6, ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), atomic_read(&dest->inactconns)); else #endif seq_printf(seq, " -> %08X:%04X " "%-7s %-6d %-10d %-10d\n", ntohl(dest->addr.ip), ntohs(dest->port), ip_vs_fwd_name(atomic_read(&dest->conn_flags)), atomic_read(&dest->weight), atomic_read(&dest->activeconns), atomic_read(&dest->inactconns)); } } return 0; } static const struct seq_operations ip_vs_info_seq_ops = { .start = ip_vs_info_seq_start, .next = ip_vs_info_seq_next, .stop = ip_vs_info_seq_stop, .show = ip_vs_info_seq_show, }; static int ip_vs_stats_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_kstats show; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, (unsigned long long)show.outpkts, (unsigned long long)show.inbytes, (unsigned long long)show.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n", (unsigned long long)show.cps, (unsigned long long)show.inpps, (unsigned long long)show.outpps, (unsigned long long)show.inbps, (unsigned long long)show.outbps); return 0; } static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_kstats kstats; int i; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); seq_puts(seq, "CPU Conns Packets Packets Bytes Bytes\n"); for_each_possible_cpu(i) { struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); unsigned int start; u64 conns, inpkts, outpkts, inbytes, outbytes; do { start = u64_stats_fetch_begin(&u->syncp); conns = u64_stats_read(&u->cnt.conns); inpkts = u64_stats_read(&u->cnt.inpkts); outpkts = u64_stats_read(&u->cnt.outpkts); inbytes = u64_stats_read(&u->cnt.inbytes); outbytes = u64_stats_read(&u->cnt.outbytes); } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, (u64)outpkts, (u64)inbytes, (u64)outbytes); } ip_vs_copy_stats(&kstats, tot_stats); seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)kstats.conns, (unsigned long long)kstats.inpkts, (unsigned long long)kstats.outpkts, (unsigned long long)kstats.inbytes, (unsigned long long)kstats.outbytes); /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n", kstats.cps, kstats.inpps, kstats.outpps, kstats.inbps, kstats.outbps); return 0; } #endif /* * Set timeout values for tcp tcpfin udp in the timeout_table. */ static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; #endif IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", u->tcp_timeout, u->tcp_fin_timeout, u->udp_timeout); #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) || u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) { return -EINVAL; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ)) return -EINVAL; #endif #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } #endif return 0; } #define CMDID(cmd) (cmd - IP_VS_BASE_CTL) struct ip_vs_svcdest_user { struct ip_vs_service_user s; struct ip_vs_dest_user d; }; static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = { [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user), [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user), [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user), [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user), [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user), }; union ip_vs_set_arglen { struct ip_vs_service_user field_IP_VS_SO_SET_ADD; struct ip_vs_service_user field_IP_VS_SO_SET_EDIT; struct ip_vs_service_user field_IP_VS_SO_SET_DEL; struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST; struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST; struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST; struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT; struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON; struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON; struct ip_vs_service_user field_IP_VS_SO_SET_ZERO; }; #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen) static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, struct ip_vs_service_user *usvc_compat) { memset(usvc, 0, sizeof(*usvc)); usvc->af = AF_INET; usvc->protocol = usvc_compat->protocol; usvc->addr.ip = usvc_compat->addr; usvc->port = usvc_compat->port; usvc->fwmark = usvc_compat->fwmark; /* Deep copy of sched_name is not needed here */ usvc->sched_name = usvc_compat->sched_name; usvc->flags = usvc_compat->flags; usvc->timeout = usvc_compat->timeout; usvc->netmask = usvc_compat->netmask; } static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, struct ip_vs_dest_user *udest_compat) { memset(udest, 0, sizeof(*udest)); udest->addr.ip = udest_compat->addr; udest->port = udest_compat->port; udest->conn_flags = udest_compat->conn_flags; udest->weight = udest_compat->weight; udest->u_threshold = udest_compat->u_threshold; udest->l_threshold = udest_compat->l_threshold; udest->af = AF_INET; udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP; } static int do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len) { struct net *net = sock_net(sk); int ret; unsigned char arg[MAX_SET_ARGLEN]; struct ip_vs_service_user *usvc_compat; struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; struct ip_vs_dest_user *udest_compat; struct ip_vs_dest_user_kern udest; struct netns_ipvs *ipvs = net_ipvs(net); BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX) return -EINVAL; if (len != set_arglen[CMDID(cmd)]) { IP_VS_DBG(1, "set_ctl: len %u != %u\n", len, set_arglen[CMDID(cmd)]); return -EINVAL; } if (copy_from_sockptr(arg, ptr, len) != 0) return -EFAULT; /* Handle daemons since they have another lock */ if (cmd == IP_VS_SO_SET_STARTDAEMON || cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; if (cmd == IP_VS_SO_SET_STARTDAEMON) { struct ipvs_sync_daemon_cfg cfg; memset(&cfg, 0, sizeof(cfg)); ret = -EINVAL; if (strscpy(cfg.mcast_ifn, dm->mcast_ifn, sizeof(cfg.mcast_ifn)) <= 0) return ret; cfg.syncid = dm->syncid; ret = start_sync_thread(ipvs, &cfg, dm->state); } else { ret = stop_sync_thread(ipvs, dm->state); } return ret; } mutex_lock(&__ip_vs_mutex); if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ ret = ip_vs_flush(ipvs, false); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg); goto out_unlock; } else if (!len) { /* No more commands with len == 0 below */ ret = -EINVAL; goto out_unlock; } usvc_compat = (struct ip_vs_service_user *)arg; udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); /* We only use the new structs internally, so copy userspace compat * structs to extended internal versions */ ip_vs_copy_usvc_compat(&usvc, usvc_compat); ip_vs_copy_udest_compat(&udest, udest_compat); if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { ret = ip_vs_zero_all(ipvs); goto out_unlock; } } if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) && strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) == IP_VS_SCHEDNAME_MAXLEN) { ret = -EINVAL; goto out_unlock; } /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */ if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP && usvc.protocol != IPPROTO_SCTP) { pr_err("set_ctl: invalid protocol: %d %pI4:%d\n", usvc.protocol, &usvc.addr.ip, ntohs(usvc.port)); ret = -EFAULT; goto out_unlock; } /* Lookup the exact service by <protocol, addr, port> or fwmark */ rcu_read_lock(); if (usvc.fwmark == 0) svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark); rcu_read_unlock(); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; goto out_unlock; } switch (cmd) { case IP_VS_SO_SET_ADD: if (svc != NULL) ret = -EEXIST; else ret = ip_vs_add_service(ipvs, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); break; case IP_VS_SO_SET_DEL: ret = ip_vs_del_service(svc); if (!ret) goto out_unlock; break; case IP_VS_SO_SET_ZERO: ret = ip_vs_zero_service(svc); break; case IP_VS_SO_SET_ADDDEST: ret = ip_vs_add_dest(svc, &udest); break; case IP_VS_SO_SET_EDITDEST: ret = ip_vs_edit_dest(svc, &udest); break; case IP_VS_SO_SET_DELDEST: ret = ip_vs_del_dest(svc, &udest); break; default: WARN_ON_ONCE(1); ret = -EINVAL; break; } out_unlock: mutex_unlock(&__ip_vs_mutex); return ret; } static void ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; struct ip_vs_kstats kstats; char *sched_name; sched = rcu_dereference_protected(src->scheduler, 1); sched_name = sched ? sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; dst->num_dests = src->num_dests; ip_vs_copy_stats(&kstats, &src->stats); ip_vs_export_stats_user(&dst->stats, &kstats); } static inline int __ip_vs_get_service_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { int idx, count=0; struct ip_vs_service *svc; struct ip_vs_service_entry entry; int ret = 0; for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; goto out; } count++; } } for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ if (svc->af != AF_INET || (svc->ipvs != ipvs)) continue; if (count >= get->num_services) goto out; memset(&entry, 0, sizeof(entry)); ip_vs_copy_service(&entry, svc); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; goto out; } count++; } } out: return ret; } static inline int __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; union nf_inet_addr addr = { .ip = get->addr }; int ret = 0; rcu_read_lock(); if (get->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark); else svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr, get->port); rcu_read_unlock(); if (svc) { int count = 0; struct ip_vs_dest *dest; struct ip_vs_dest_entry entry; struct ip_vs_kstats kstats; memset(&entry, 0, sizeof(entry)); list_for_each_entry(dest, &svc->destinations, n_list) { if (count >= get->num_dests) break; /* Cannot expose heterogeneous members via sockopt * interface */ if (dest->af != svc->af) continue; entry.addr = dest->addr.ip; entry.port = dest->port; entry.conn_flags = atomic_read(&dest->conn_flags); entry.weight = atomic_read(&dest->weight); entry.u_threshold = dest->u_threshold; entry.l_threshold = dest->l_threshold; entry.activeconns = atomic_read(&dest->activeconns); entry.inactconns = atomic_read(&dest->inactconns); entry.persistconns = atomic_read(&dest->persistconns); ip_vs_copy_stats(&kstats, &dest->stats); ip_vs_export_stats_user(&entry.stats, &kstats); if (copy_to_user(&uptr->entrytable[count], &entry, sizeof(entry))) { ret = -EFAULT; break; } count++; } } else ret = -ESRCH; return ret; } static inline void __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u) { #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP) struct ip_vs_proto_data *pd; #endif memset(u, 0, sizeof (*u)); #ifdef CONFIG_IP_VS_PROTO_TCP pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP); u->udp_timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif } static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = { [CMDID(IP_VS_SO_GET_VERSION)] = 64, [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo), [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services), [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry), [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests), [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user), [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user), }; union ip_vs_get_arglen { char field_IP_VS_SO_GET_VERSION[64]; struct ip_vs_getinfo field_IP_VS_SO_GET_INFO; struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES; struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE; struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS; struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT; struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2]; }; #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen) static int do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { unsigned char arg[MAX_GET_ARGLEN]; int ret = 0; unsigned int copylen; struct net *net = sock_net(sk); struct netns_ipvs *ipvs = net_ipvs(net); BUG_ON(!net); BUILD_BUG_ON(sizeof(arg) > 255); if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX) return -EINVAL; copylen = get_arglen[CMDID(cmd)]; if (*len < (int) copylen) { IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen); return -EINVAL; } if (copy_from_user(arg, user, copylen) != 0) return -EFAULT; /* * Handle daemons first since it has its own locking */ if (cmd == IP_VS_SO_GET_DAEMON) { struct ip_vs_daemon_user d[2]; memset(&d, 0, sizeof(d)); mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; mutex_unlock(&ipvs->sync_mutex); return ret; } mutex_lock(&__ip_vs_mutex); switch (cmd) { case IP_VS_SO_GET_VERSION: { char buf[64]; sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size); if (copy_to_user(user, buf, strlen(buf)+1) != 0) { ret = -EFAULT; goto out; } *len = strlen(buf)+1; } break; case IP_VS_SO_GET_INFO: { struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; info.size = ip_vs_conn_tab_size; info.num_services = ipvs->num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } break; case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; int size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, get->num_services); if (*len != size) { pr_err("length: %u != %u\n", *len, size); ret = -EINVAL; goto out; } ret = __ip_vs_get_service_entries(ipvs, get, user); } break; case IP_VS_SO_GET_SERVICE: { struct ip_vs_service_entry *entry; struct ip_vs_service *svc; union nf_inet_addr addr; entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; rcu_read_lock(); if (entry->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark); else svc = __ip_vs_service_find(ipvs, AF_INET, entry->protocol, &addr, entry->port); rcu_read_unlock(); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) ret = -EFAULT; } else ret = -ESRCH; } break; case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; int size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { pr_err("length: %u != %u\n", *len, size); ret = -EINVAL; goto out; } ret = __ip_vs_get_dest_entries(ipvs, get, user); } break; case IP_VS_SO_GET_TIMEOUT: { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } break; default: ret = -EINVAL; } out: mutex_unlock(&__ip_vs_mutex); return ret; } static struct nf_sockopt_ops ip_vs_sockopts = { .pf = PF_INET, .set_optmin = IP_VS_BASE_CTL, .set_optmax = IP_VS_SO_SET_MAX+1, .set = do_ip_vs_set_ctl, .get_optmin = IP_VS_BASE_CTL, .get_optmax = IP_VS_SO_GET_MAX+1, .get = do_ip_vs_get_ctl, .owner = THIS_MODULE, }; /* * Generic Netlink interface */ /* IPVS genetlink family */ static struct genl_family ip_vs_genl_family; /* Policy used for first-level command attributes */ static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, .len = IP_VS_IFNAME_MAXLEN - 1 }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, .len = sizeof(union nf_inet_addr) }, [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_SCHEDNAME_MAXLEN - 1 }, [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_PENAME_MAXLEN }, [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, .len = sizeof(struct ip_vs_flags) }, [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, .len = sizeof(union nf_inet_addr) }, [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 }, [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 }, [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 }, }; static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, IPVS_STATS_ATTR_PAD) || nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps)) goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, struct ip_vs_kstats *kstats) { struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type); if (!nl_stats) return -EMSGSIZE; if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, IPVS_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, IPVS_STATS_ATTR_PAD)) goto nla_put_failure; nla_nest_end(skb, nl_stats); return 0; nla_put_failure: nla_nest_cancel(skb, nl_stats); return -EMSGSIZE; } static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_service *svc) { struct ip_vs_scheduler *sched; struct ip_vs_pe *pe; struct nlattr *nl_service; struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; struct ip_vs_kstats kstats; char *sched_name; nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) return -EMSGSIZE; if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af)) goto nla_put_failure; if (svc->fwmark) { if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark)) goto nla_put_failure; } else { if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) || nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) || nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port)) goto nla_put_failure; } sched = rcu_dereference_protected(svc->scheduler, 1); sched_name = sched ? sched->name : "none"; pe = rcu_dereference_protected(svc->pe, 1); if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask)) goto nla_put_failure; ip_vs_copy_stats(&kstats, &svc->stats); if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats)) goto nla_put_failure; if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_service); return 0; nla_put_failure: nla_nest_cancel(skb, nl_service); return -EMSGSIZE; } static int ip_vs_genl_dump_service(struct sk_buff *skb, struct ip_vs_service *svc, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_SERVICE); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_service(skb, svc) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_services(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; goto nla_put_failure; } } } for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { if (++idx <= start || (svc->ipvs != ipvs)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; goto nla_put_failure; } } } nla_put_failure: mutex_unlock(&__ip_vs_mutex); cb->args[0] = idx; return skb->len; } static bool ip_vs_is_af_valid(int af) { if (af == AF_INET) return true; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6 && ipv6_mod_enabled()) return true; #endif return false; } static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, bool full_entry, struct ip_vs_service **ret_svc) { struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; struct ip_vs_service *svc; /* Parse mandatory identifying service fields first */ if (nla == NULL || nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL)) return -EINVAL; nla_af = attrs[IPVS_SVC_ATTR_AF]; nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; nla_port = attrs[IPVS_SVC_ATTR_PORT]; nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) return -EINVAL; memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { usvc->protocol = IPPROTO_TCP; usvc->fwmark = nla_get_u32(nla_fwmark); } else { usvc->protocol = nla_get_u16(nla_protocol); nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); usvc->port = nla_get_be16(nla_port); usvc->fwmark = 0; } rcu_read_lock(); if (usvc->fwmark) svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark); else svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol, &usvc->addr, usvc->port); rcu_read_unlock(); *ret_svc = svc; /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, *nla_netmask; struct ip_vs_flags flags; nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) return -EINVAL; nla_memcpy(&flags, nla_flags, sizeof(flags)); /* prefill flags from service if it already exists */ if (svc) usvc->flags = svc->flags; /* set new flags from userland */ usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); usvc->sched_name = nla_data(nla_sched); usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_be32(nla_netmask); } return 0; } static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs, struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc); return ret ? ERR_PTR(ret) : svc; } static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) { struct nlattr *nl_dest; struct ip_vs_kstats kstats; nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST); if (!nl_dest) return -EMSGSIZE; if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) || nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) || nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD, (atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK)) || nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)) || nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE, dest->tun_type) || nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT, dest->tun_port) || nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS, dest->tun_flags) || nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) || nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, atomic_read(&dest->activeconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS, atomic_read(&dest->inactconns)) || nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, atomic_read(&dest->persistconns)) || nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af)) goto nla_put_failure; ip_vs_copy_stats(&kstats, &dest->stats); if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats)) goto nla_put_failure; if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats)) goto nla_put_failure; nla_nest_end(skb, nl_dest); return 0; nla_put_failure: nla_nest_cancel(skb, nl_dest); return -EMSGSIZE; } static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DEST); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_dest(skb, dest) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0; int start = cb->args[0]; struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&__ip_vs_mutex); /* Try to find the service for which to dump destinations */ if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack)) goto out_err; svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR_OR_NULL(svc)) goto out_err; /* Dump the destinations */ list_for_each_entry(dest, &svc->destinations, n_list) { if (++idx <= start) continue; if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { idx--; goto nla_put_failure; } } nla_put_failure: cb->args[0] = idx; out_err: mutex_unlock(&__ip_vs_mutex); return skb->len; } static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, struct nlattr *nla, bool full_entry) { struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; struct nlattr *nla_addr, *nla_port; struct nlattr *nla_addr_family; /* Parse mandatory identifying destination fields first */ if (nla == NULL || nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL)) return -EINVAL; nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; nla_port = attrs[IPVS_DEST_ATTR_PORT]; nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY]; if (!(nla_addr && nla_port)) return -EINVAL; memset(udest, 0, sizeof(*udest)); nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); udest->port = nla_get_be16(nla_port); udest->af = nla_get_u16_default(nla_addr_family, 0); /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, *nla_l_thresh, *nla_tun_type, *nla_tun_port, *nla_tun_flags; nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE]; nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT]; nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS]; if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) return -EINVAL; udest->conn_flags = nla_get_u32(nla_fwd) & IP_VS_CONN_F_FWD_MASK; udest->weight = nla_get_u32(nla_weight); udest->u_threshold = nla_get_u32(nla_u_thresh); udest->l_threshold = nla_get_u32(nla_l_thresh); if (nla_tun_type) udest->tun_type = nla_get_u8(nla_tun_type); if (nla_tun_port) udest->tun_port = nla_get_be16(nla_tun_port); if (nla_tun_flags) udest->tun_flags = nla_get_u16(nla_tun_flags); } return 0; } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON); if (!nl_daemon) return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; #ifdef CONFIG_IP_VS_IPV6 if (c->mcast_af == AF_INET6) { if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, &c->mcast_group.in6)) goto nla_put_failure; } else #endif if (c->mcast_af == AF_INET && nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, c->mcast_group.ip)) goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; nla_put_failure: nla_nest_cancel(skb, nl_daemon); return -EMSGSIZE; } static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ip_vs_genl_family, NLM_F_MULTI, IPVS_CMD_NEW_DAEMON); if (!hdr) return -EMSGSIZE; if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; } if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; } nla_put_failure: mutex_unlock(&ipvs->sync_mutex); return skb->len; } static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ipvs_sync_daemon_cfg c; struct nlattr *a; int ret; memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), sizeof(c.mcast_ifn)); c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; if (a) c.sync_maxlen = nla_get_u16(a); a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; if (a) { c.mcast_af = AF_INET; c.mcast_group.ip = nla_get_in_addr(a); if (!ipv4_is_multicast(c.mcast_group.ip)) return -EINVAL; } else { a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; if (a) { #ifdef CONFIG_IP_VS_IPV6 int addr_type; c.mcast_af = AF_INET6; c.mcast_group.in6 = nla_get_in6_addr(a); addr_type = ipv6_addr_type(&c.mcast_group.in6); if (!(addr_type & IPV6_ADDR_MULTICAST)) return -EINVAL; #else return -EAFNOSUPPORT; #endif } } a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; if (a) c.mcast_port = nla_get_u16(a); a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; if (a) c.mcast_ttl = nla_get_u8(a); /* The synchronization protocol is incompatible with mixed family * services */ if (ipvs->mixed_address_family_dests > 0) return -EINVAL; ret = start_sync_thread(ipvs, &c, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); return ret; } static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) { int ret; if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; ret = stop_sync_thread(ipvs, nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); return ret; } static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs) { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) t.tcp_fin_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); return ip_vs_set_timeout(ipvs, &t); } static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { int ret = -EINVAL, cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack)) goto out; if (cmd == IPVS_CMD_NEW_DAEMON) ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs); else ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs); } out: return ret; } static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) { bool need_full_svc = false, need_full_dest = false; struct ip_vs_service *svc = NULL; struct ip_vs_service_user_kern usvc; struct ip_vs_dest_user_kern udest; int ret = 0, cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { ret = ip_vs_flush(ipvs, false); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { ret = ip_vs_genl_set_config(ipvs, info->attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { ret = ip_vs_zero_all(ipvs); goto out; } /* All following commands require a service argument, so check if we * received a valid one. We need a full service specification when * adding / editing a service. Only identifying members otherwise. */ if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = true; ret = ip_vs_genl_parse_service(ipvs, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); if (ret) goto out; /* Unless we're adding a new service, the service must already exist */ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { ret = -ESRCH; goto out; } /* Destination commands require a valid destination argument. For * adding / editing a destination, we need a full destination * specification. */ if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || cmd == IPVS_CMD_DEL_DEST) { if (cmd != IPVS_CMD_DEL_DEST) need_full_dest = true; ret = ip_vs_genl_parse_dest(&udest, info->attrs[IPVS_CMD_ATTR_DEST], need_full_dest); if (ret) goto out; /* Old protocols did not allow the user to specify address * family, so we set it to zero instead. We also didn't * allow heterogeneous pools in the old code, so it's safe * to assume that this will have the same address family as * the service. */ if (udest.af == 0) udest.af = svc->af; if (!ip_vs_is_af_valid(udest.af)) { ret = -EAFNOSUPPORT; goto out; } if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services */ if (ipvs->sync_state) { ret = -EINVAL; goto out; } /* Which connection types do we support? */ switch (udest.conn_flags) { case IP_VS_CONN_F_TUNNEL: /* We are able to forward this */ break; default: ret = -EINVAL; goto out; } } } switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) ret = ip_vs_add_service(ipvs, &usvc, &svc); else ret = -EEXIST; break; case IPVS_CMD_SET_SERVICE: ret = ip_vs_edit_service(svc, &usvc); break; case IPVS_CMD_DEL_SERVICE: ret = ip_vs_del_service(svc); /* do not use svc, it can be freed */ break; case IPVS_CMD_NEW_DEST: ret = ip_vs_add_dest(svc, &udest); break; case IPVS_CMD_SET_DEST: ret = ip_vs_edit_dest(svc, &udest); break; case IPVS_CMD_DEL_DEST: ret = ip_vs_del_dest(svc, &udest); break; case IPVS_CMD_ZERO: ret = ip_vs_zero_service(svc); break; default: ret = -EINVAL; } out: mutex_unlock(&__ip_vs_mutex); return ret; } static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; struct net *net = sock_net(skb->sk); struct netns_ipvs *ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) reply_cmd = IPVS_CMD_NEW_SERVICE; else if (cmd == IPVS_CMD_GET_INFO) reply_cmd = IPVS_CMD_SET_INFO; else if (cmd == IPVS_CMD_GET_CONFIG) reply_cmd = IPVS_CMD_SET_CONFIG; else { pr_err("unknown Generic Netlink command\n"); return -EINVAL; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; mutex_lock(&__ip_vs_mutex); reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); if (reply == NULL) goto nla_put_failure; switch (cmd) { case IPVS_CMD_GET_SERVICE: { struct ip_vs_service *svc; svc = ip_vs_genl_find_service(ipvs, info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); goto out_err; } else if (svc) { ret = ip_vs_genl_fill_service(msg, svc); if (ret) goto nla_put_failure; } else { ret = -ESRCH; goto out_err; } break; } case IPVS_CMD_GET_CONFIG: { struct ip_vs_timeout_user t; __ip_vs_get_timeouts(ipvs, &t); #ifdef CONFIG_IP_VS_PROTO_TCP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout) || nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, t.tcp_fin_timeout)) goto nla_put_failure; #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout)) goto nla_put_failure; #endif break; } case IPVS_CMD_GET_INFO: if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE) || nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, ip_vs_conn_tab_size)) goto nla_put_failure; break; } genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); goto out; nla_put_failure: pr_err("not enough space in Netlink message\n"); ret = -EMSGSIZE; out_err: nlmsg_free(msg); out: mutex_unlock(&__ip_vs_mutex); return ret; } static const struct genl_small_ops ip_vs_genl_ops[] = { { .cmd = IPVS_CMD_NEW_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_SERVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, .dumpit = ip_vs_genl_dump_services, }, { .cmd = IPVS_CMD_NEW_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_SET_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_DEL_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_DEST, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_dests, }, { .cmd = IPVS_CMD_NEW_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_DEL_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_daemon, }, { .cmd = IPVS_CMD_GET_DAEMON, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .dumpit = ip_vs_genl_dump_daemons, }, { .cmd = IPVS_CMD_SET_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_GET_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_GET_INFO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_get_cmd, }, { .cmd = IPVS_CMD_ZERO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, { .cmd = IPVS_CMD_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_ADMIN_PERM, .doit = ip_vs_genl_set_cmd, }, }; static struct genl_family ip_vs_genl_family __ro_after_init = { .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, .maxattr = IPVS_CMD_ATTR_MAX, .policy = ip_vs_cmd_policy, .netnsok = true, /* Make ipvsadm to work on netns */ .module = THIS_MODULE, .small_ops = ip_vs_genl_ops, .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops), .resv_start_op = IPVS_CMD_FLUSH + 1, }; static int __init ip_vs_genl_register(void) { return genl_register_family(&ip_vs_genl_family); } static void ip_vs_genl_unregister(void) { genl_unregister_family(&ip_vs_genl_family); } /* End of Generic Netlink interface definitions */ /* * per netns intit/exit func. */ #ifdef CONFIG_SYSCTL static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; struct ctl_table *tbl; int idx, ret; size_t ctl_table_size = ARRAY_SIZE(vs_vars); bool unpriv = net->user_ns != &init_user_ns; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, expire_nodest_conn_handler); ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); if (tbl == NULL) return -ENOMEM; } else tbl = vs_vars; /* Initialize sysctl defaults */ for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) { if (tbl[idx].proc_handler == proc_do_defense_mode) tbl[idx].extra2 = ipvs; } idx = 0; ipvs->sysctl_amemthresh = 1024; tbl[idx++].data = &ipvs->sysctl_amemthresh; ipvs->sysctl_am_droprate = 10; tbl[idx++].data = &ipvs->sysctl_am_droprate; tbl[idx++].data = &ipvs->sysctl_drop_entry; tbl[idx++].data = &ipvs->sysctl_drop_packet; #ifdef CONFIG_IP_VS_NFCT tbl[idx++].data = &ipvs->sysctl_conntrack; #endif tbl[idx++].data = &ipvs->sysctl_secure_tcp; ipvs->sysctl_snat_reroute = 1; tbl[idx++].data = &ipvs->sysctl_snat_reroute; ipvs->sysctl_sync_ver = 1; tbl[idx++].data = &ipvs->sysctl_sync_ver; ipvs->sysctl_sync_ports = 1; tbl[idx++].data = &ipvs->sysctl_sync_ports; tbl[idx++].data = &ipvs->sysctl_sync_persist_mode; ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32; if (unpriv) tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_qlen_max; ipvs->sysctl_sync_sock_size = 0; if (unpriv) tbl[idx].mode = 0444; tbl[idx++].data = &ipvs->sysctl_sync_sock_size; tbl[idx++].data = &ipvs->sysctl_cache_bypass; tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; tbl[idx++].data = &ipvs->sysctl_sloppy_tcp; tbl[idx++].data = &ipvs->sysctl_sloppy_sctp; tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD; ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD; tbl[idx].data = &ipvs->sysctl_sync_threshold; tbl[idx].extra2 = ipvs; tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD; tbl[idx++].data = &ipvs->sysctl_sync_refresh_period; ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3); tbl[idx++].data = &ipvs->sysctl_sync_retries; tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; ipvs->sysctl_pmtu_disc = 1; tbl[idx++].data = &ipvs->sysctl_pmtu_disc; tbl[idx++].data = &ipvs->sysctl_backup_only; ipvs->sysctl_conn_reuse_mode = 1; tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_cpulist; ipvs->sysctl_est_nice = IPVS_EST_NICE; if (unpriv) tbl[idx].mode = 0444; tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_est_nice; #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) tbl[idx++].mode = 0444; #endif ret = -ENOMEM; ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl, ctl_table_size); if (!ipvs->sysctl_hdr) goto err; ipvs->sysctl_tbl = tbl; ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); if (ret < 0) goto err; /* Schedule defense work */ queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); return 0; err: unregister_net_sysctl_table(ipvs->sysctl_hdr); if (!net_eq(net, &init_net)) kfree(tbl); return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work); cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); if (ipvs->est_cpulist_valid) free_cpumask_var(ipvs->sysctl_est_cpulist); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); } #else static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { } #endif static struct notifier_block ip_vs_dst_notifier = { .notifier_call = ip_vs_dst_event, #ifdef CONFIG_IP_VS_IPV6 .priority = ADDRCONF_NOTIFY_PRIORITY + 5, #endif }; int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { int ret = -ENOMEM; int idx; /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) INIT_HLIST_HEAD(&ipvs->rs_table[idx]); INIT_LIST_HEAD(&ipvs->dest_trash); spin_lock_init(&ipvs->dest_trash_lock); timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); if (!ipvs->tot_stats) goto out; if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) goto err_tot_stats; #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) goto err_vs; if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net, ip_vs_stats_show, NULL)) goto err_stats; if (!proc_create_net_single("ip_vs_stats_percpu", 0, ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; #endif ret = ip_vs_control_net_init_sysctl(ipvs); if (ret < 0) goto err; return 0; err: #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); err_percpu: remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); err_stats: remove_proc_entry("ip_vs", ipvs->net->proc_net); err_vs: #endif ip_vs_stats_release(&ipvs->tot_stats->s); err_tot_stats: kfree(ipvs->tot_stats); out: return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); #endif call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) { int ret; ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); goto err_sock; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); goto err_genl; } return 0; err_genl: nf_unregister_sockopt(&ip_vs_sockopts); err_sock: return ret; } void ip_vs_unregister_nl_ioctl(void) { ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); } int __init ip_vs_control_init(void) { int idx; int ret; /* Initialize svc_table, ip_vs_svc_fwm_table */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { INIT_HLIST_HEAD(&ip_vs_svc_table[idx]); INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]); } smp_wmb(); /* Do we really need it now ? */ ret = register_netdevice_notifier(&ip_vs_dst_notifier); if (ret < 0) return ret; return 0; } void ip_vs_control_cleanup(void) { unregister_netdevice_notifier(&ip_vs_dst_notifier); /* relying on common rcu_barrier() in ip_vs_cleanup() */ } |
2 121 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_TABLES_IPV6_H_ #define _NF_TABLES_IPV6_H_ #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/ipv6.h> #include <net/netfilter/nf_tables.h> static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) { unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) { nft_set_pktinfo_unspec(pkt); return; } pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; } static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; u32 pkt_len, skb_len; int protohdr; ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*ip6h), &_ip6h); if (!ip6h) return -1; if (ip6h->version != 6) return -1; pkt_len = ntohs(ip6h->payload_len); skb_len = pkt->skb->len - skb_network_offset(pkt->skb); if (pkt_len + sizeof(*ip6h) > skb_len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; return 0; #else return -1; #endif } static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { if (__nft_set_pktinfo_ipv6_validate(pkt) < 0) nft_set_pktinfo_unspec(pkt); } static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; unsigned short frag_off; unsigned int thoff = 0; struct inet6_dev *idev; struct ipv6hdr *ip6h; int protohdr; u32 pkt_len; if (!pskb_may_pull(pkt->skb, sizeof(*ip6h))) return -1; ip6h = ipv6_hdr(pkt->skb); if (ip6h->version != 6) goto inhdr_error; pkt_len = ntohs(ip6h->payload_len); if (pkt_len + sizeof(*ip6h) > pkt->skb->len) { idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) goto inhdr_error; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; return 0; inhdr_error: idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INHDRERRORS); return -1; #else return -1; #endif } #endif |
2 4 4 9 6 3 2 2 1 2 2 1 1 1 1 1 1 9 4 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H #include <linux/rcupdate_wait.h> #define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) #define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) #define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) #define mtype_do_add IPSET_TOKEN(MTYPE, _do_add) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_do_del IPSET_TOKEN(MTYPE, _do_del) #define mtype_do_list IPSET_TOKEN(MTYPE, _do_list) #define mtype_do_head IPSET_TOKEN(MTYPE, _do_head) #define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem) #define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) #define mtype_memsize IPSET_TOKEN(MTYPE, _memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_add IPSET_TOKEN(MTYPE, _add) #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct mtype *map = set->data; timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } static void mtype_ext_cleanup(struct ip_set *set) { struct mtype *map = set->data; u32 id; for (id = 0; id < map->elements; id++) if (test_bit(id, map->members)) ip_set_ext_destroy(set, get_ext(set, map, id)); } static void mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); ip_set_free(map); set->data = NULL; } static void mtype_flush(struct ip_set *set) { struct mtype *map = set->data; if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } /* Calculate the actual memory size of the set data */ static size_t mtype_memsize(const struct mtype *map, size_t dsize) { return sizeof(*map) + map->memsize + map->elements * dsize; } static int mtype_head(struct ip_set *set, struct sk_buff *skb) { const struct mtype *map = set->data; struct nlattr *nested; size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; nla_nest_end(skb, nested); return 0; nla_put_failure: return -EMSGSIZE; } static int mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); int ret = mtype_do_test(e, map, set->dsize); if (ret <= 0) return ret; return ip_set_match_extensions(set, ext, mext, flags, x); } static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); int ret = mtype_do_add(e, map, flags, set->dsize); if (ret == IPSET_ADD_FAILED) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(x, set))) { set->elements--; ret = 0; } else if (!(flags & IPSET_FLAG_EXIST)) { set_bit(e->id, map->members); return -IPSET_ERR_EXIST; } /* Element is re-added, cleanup extensions */ ip_set_ext_destroy(set, x); } if (ret > 0) set->elements--; if (SET_WITH_TIMEOUT(set)) #ifdef IP_SET_BITMAP_STORED_TIMEOUT mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret); #else ip_set_timeout_set(ext_timeout(x, set), ext->timeout); #endif if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(x, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(set, ext_comment(x, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(x, set), ext); /* Activate element */ set_bit(e->id, map->members); set->elements++; return 0; } static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); if (mtype_do_del(e, map)) return -IPSET_ERR_EXIST; ip_set_ext_destroy(set, x); set->elements--; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(x, set))) return -IPSET_ERR_EXIST; return 0; } #ifndef IP_SET_BITMAP_STORED_TIMEOUT static bool mtype_is_filled(const struct mtype_elem *x) { return true; } #endif static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { struct mtype *map = set->data; struct nlattr *adt, *nested; void *x; u32 id, first = cb->args[IPSET_CB_ARG0]; int ret = 0; adt = nla_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; /* Extensions may be replaced */ rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { cond_resched_rcu(); id = cb->args[IPSET_CB_ARG0]; x = get_ext(set, map, id); if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT mtype_is_filled(x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); ret = -EMSGSIZE; goto out; } goto nla_put_failure; } if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, mtype_is_filled(x))) goto nla_put_failure; nla_nest_end(skb, nested); } nla_nest_end(skb, adt); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(id == first)) { cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } nla_nest_end(skb, adt); out: rcu_read_unlock(); return ret; } static void mtype_gc(struct timer_list *t) { struct mtype *map = from_timer(map, t, gc); struct ip_set *set = map->set; void *x; u32 id; /* We run parallel with other readers (test element) * but adding/deleting new entries is locked out */ spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { x = get_ext(set, map, id); if (ip_set_timeout_expired(ext_timeout(x, set))) { clear_bit(id, map->members); ip_set_ext_destroy(set, x); set->elements--; } } spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); } static void mtype_cancel_gc(struct ip_set *set) { struct mtype *map = set->data; if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); } static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, .adt = { [IPSET_ADD] = mtype_add, [IPSET_DEL] = mtype_del, [IPSET_TEST] = mtype_test, }, .destroy = mtype_destroy, .flush = mtype_flush, .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */ |
26 27 3 1333 1328 1345 1329 3 1345 960 945 14 960 988 196 3036 11 1607 1553 99 561 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 | // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * Copyright (C) 2017-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved. * * This driver produces cryptographically secure pseudorandom data. It is divided * into roughly six sections, each with a section header: * * - Initialization and readiness waiting. * - Fast key erasure RNG, the "crng". * - Entropy accumulation and extraction routines. * - Entropy collection routines. * - Userspace reader/writer interfaces. * - Sysctl interface. * * The high level overview is that there is one input pool, into which * various pieces of data are hashed. Prior to initialization, some of that * data is then "credited" as having a certain number of bits of entropy. * When enough bits of entropy are available, the hash is finalized and * handed as a key to a stream cipher that expands it indefinitely for * various consumers. This key is periodically refreshed as the various * entropy collectors, described below, add data to the input pool. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/utsname.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/string.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/random.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/mm.h> #include <linux/nodemask.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/workqueue.h> #include <linux/irq.h> #include <linux/ratelimit.h> #include <linux/syscalls.h> #include <linux/completion.h> #include <linux/uuid.h> #include <linux/uaccess.h> #include <linux/suspend.h> #include <linux/siphash.h> #include <linux/sched/isolation.h> #include <crypto/chacha.h> #include <crypto/blake2s.h> #ifdef CONFIG_VDSO_GETRANDOM #include <vdso/getrandom.h> #include <vdso/datapage.h> #include <vdso/vsyscall.h> #endif #include <asm/archrandom.h> #include <asm/processor.h> #include <asm/irq.h> #include <asm/irq_regs.h> #include <asm/io.h> /********************************************************************* * * Initialization and readiness waiting. * * Much of the RNG infrastructure is devoted to various dependencies * being able to wait until the RNG has collected enough entropy and * is ready for safe consumption. * *********************************************************************/ /* * crng_init is protected by base_crng->lock, and only increases * its value (from empty->early->ready). */ static enum { CRNG_EMPTY = 0, /* Little to no entropy collected */ CRNG_EARLY = 1, /* At least POOL_EARLY_BITS collected */ CRNG_READY = 2 /* Fully initialized with POOL_READY_BITS collected */ } crng_init __read_mostly = CRNG_EMPTY; static DEFINE_STATIC_KEY_FALSE(crng_is_ready); #define crng_ready() (static_branch_likely(&crng_is_ready) || crng_init >= CRNG_READY) /* Various types of waiters for crng_init->CRNG_READY transition. */ static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait); static struct fasync_struct *fasync; static ATOMIC_NOTIFIER_HEAD(random_ready_notifier); /* Control how we warn userspace. */ static struct ratelimit_state urandom_warning = RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE); static int ratelimit_disable __read_mostly = IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM); module_param_named(ratelimit_disable, ratelimit_disable, int, 0644); MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression"); /* * Returns whether or not the input pool has been seeded and thus guaranteed * to supply cryptographically secure random numbers. This applies to: the * /dev/urandom device, the get_random_bytes function, and the get_random_{u8, * u16,u32,u64,long} family of functions. * * Returns: true if the input pool has been seeded. * false if the input pool has not been seeded. */ bool rng_is_initialized(void) { return crng_ready(); } EXPORT_SYMBOL(rng_is_initialized); static void __cold crng_set_ready(struct work_struct *work) { static_branch_enable(&crng_is_ready); } /* Used by wait_for_random_bytes(), and considered an entropy collector, below. */ static void try_to_generate_entropy(void); /* * Wait for the input pool to be seeded and thus guaranteed to supply * cryptographically secure random numbers. This applies to: the /dev/urandom * device, the get_random_bytes function, and the get_random_{u8,u16,u32,u64, * long} family of functions. Using any of these functions without first * calling this function forfeits the guarantee of security. * * Returns: 0 if the input pool has been seeded. * -ERESTARTSYS if the function was interrupted by a signal. */ int wait_for_random_bytes(void) { while (!crng_ready()) { int ret; try_to_generate_entropy(); ret = wait_event_interruptible_timeout(crng_init_wait, crng_ready(), HZ); if (ret) return ret > 0 ? 0 : ret; } return 0; } EXPORT_SYMBOL(wait_for_random_bytes); /* * Add a callback function that will be invoked when the crng is initialised, * or immediately if it already has been. Only use this is you are absolutely * sure it is required. Most users should instead be able to test * `rng_is_initialized()` on demand, or make use of `get_random_bytes_wait()`. */ int __cold execute_with_initialized_rng(struct notifier_block *nb) { unsigned long flags; int ret = 0; spin_lock_irqsave(&random_ready_notifier.lock, flags); if (crng_ready()) nb->notifier_call(nb, 0, NULL); else ret = raw_notifier_chain_register((struct raw_notifier_head *)&random_ready_notifier.head, nb); spin_unlock_irqrestore(&random_ready_notifier.lock, flags); return ret; } #define warn_unseeded_randomness() \ if (IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM) && !crng_ready()) \ printk_deferred(KERN_NOTICE "random: %s called from %pS with crng_init=%d\n", \ __func__, (void *)_RET_IP_, crng_init) /********************************************************************* * * Fast key erasure RNG, the "crng". * * These functions expand entropy from the entropy extractor into * long streams for external consumption using the "fast key erasure" * RNG described at <https://blog.cr.yp.to/20170723-random.html>. * * There are a few exported interfaces for use by other drivers: * * void get_random_bytes(void *buf, size_t len) * u8 get_random_u8() * u16 get_random_u16() * u32 get_random_u32() * u32 get_random_u32_below(u32 ceil) * u32 get_random_u32_above(u32 floor) * u32 get_random_u32_inclusive(u32 floor, u32 ceil) * u64 get_random_u64() * unsigned long get_random_long() * * These interfaces will return the requested number of random bytes * into the given buffer or as a return value. This is equivalent to * a read from /dev/urandom. The u8, u16, u32, u64, long family of * functions may be higher performance for one-off random integers, * because they do a bit of buffering and do not invoke reseeding * until the buffer is emptied. * *********************************************************************/ enum { CRNG_RESEED_START_INTERVAL = HZ, CRNG_RESEED_INTERVAL = 60 * HZ }; static struct { u8 key[CHACHA_KEY_SIZE] __aligned(__alignof__(long)); unsigned long generation; spinlock_t lock; } base_crng = { .lock = __SPIN_LOCK_UNLOCKED(base_crng.lock) }; struct crng { u8 key[CHACHA_KEY_SIZE]; unsigned long generation; local_lock_t lock; }; static DEFINE_PER_CPU(struct crng, crngs) = { .generation = ULONG_MAX, .lock = INIT_LOCAL_LOCK(crngs.lock), }; /* * Return the interval until the next reseeding, which is normally * CRNG_RESEED_INTERVAL, but during early boot, it is at an interval * proportional to the uptime. */ static unsigned int crng_reseed_interval(void) { static bool early_boot = true; if (unlikely(READ_ONCE(early_boot))) { time64_t uptime = ktime_get_seconds(); if (uptime >= CRNG_RESEED_INTERVAL / HZ * 2) WRITE_ONCE(early_boot, false); else return max_t(unsigned int, CRNG_RESEED_START_INTERVAL, (unsigned int)uptime / 2 * HZ); } return CRNG_RESEED_INTERVAL; } /* Used by crng_reseed() and crng_make_state() to extract a new seed from the input pool. */ static void extract_entropy(void *buf, size_t len); /* This extracts a new crng key from the input pool. */ static void crng_reseed(struct work_struct *work) { static DECLARE_DELAYED_WORK(next_reseed, crng_reseed); unsigned long flags; unsigned long next_gen; u8 key[CHACHA_KEY_SIZE]; /* Immediately schedule the next reseeding, so that it fires sooner rather than later. */ if (likely(system_unbound_wq)) queue_delayed_work(system_unbound_wq, &next_reseed, crng_reseed_interval()); extract_entropy(key, sizeof(key)); /* * We copy the new key into the base_crng, overwriting the old one, * and update the generation counter. We avoid hitting ULONG_MAX, * because the per-cpu crngs are initialized to ULONG_MAX, so this * forces new CPUs that come online to always initialize. */ spin_lock_irqsave(&base_crng.lock, flags); memcpy(base_crng.key, key, sizeof(base_crng.key)); next_gen = base_crng.generation + 1; if (next_gen == ULONG_MAX) ++next_gen; WRITE_ONCE(base_crng.generation, next_gen); #ifdef CONFIG_VDSO_GETRANDOM /* base_crng.generation's invalid value is ULONG_MAX, while * _vdso_rng_data.generation's invalid value is 0, so add one to the * former to arrive at the latter. Use smp_store_release so that this * is ordered with the write above to base_crng.generation. Pairs with * the smp_rmb() before the syscall in the vDSO code. * * Cast to unsigned long for 32-bit architectures, since atomic 64-bit * operations are not supported on those architectures. This is safe * because base_crng.generation is a 32-bit value. On big-endian * architectures it will be stored in the upper 32 bits, but that's okay * because the vDSO side only checks whether the value changed, without * actually using or interpreting the value. */ smp_store_release((unsigned long *)&__arch_get_k_vdso_rng_data()->generation, next_gen + 1); #endif if (!static_branch_likely(&crng_is_ready)) crng_init = CRNG_READY; spin_unlock_irqrestore(&base_crng.lock, flags); memzero_explicit(key, sizeof(key)); } /* * This generates a ChaCha block using the provided key, and then * immediately overwrites that key with half the block. It returns * the resultant ChaCha state to the user, along with the second * half of the block containing 32 bytes of random data that may * be used; random_data_len may not be greater than 32. * * The returned ChaCha state contains within it a copy of the old * key value, at index 4, so the state should always be zeroed out * immediately after using in order to maintain forward secrecy. * If the state cannot be erased in a timely manner, then it is * safer to set the random_data parameter to &chacha_state[4] so * that this function overwrites it before returning. */ static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE], u32 chacha_state[CHACHA_STATE_WORDS], u8 *random_data, size_t random_data_len) { u8 first_block[CHACHA_BLOCK_SIZE]; BUG_ON(random_data_len > 32); chacha_init_consts(chacha_state); memcpy(&chacha_state[4], key, CHACHA_KEY_SIZE); memset(&chacha_state[12], 0, sizeof(u32) * 4); chacha20_block(chacha_state, first_block); memcpy(key, first_block, CHACHA_KEY_SIZE); memcpy(random_data, first_block + CHACHA_KEY_SIZE, random_data_len); memzero_explicit(first_block, sizeof(first_block)); } /* * This function returns a ChaCha state that you may use for generating * random data. It also returns up to 32 bytes on its own of random data * that may be used; random_data_len may not be greater than 32. */ static void crng_make_state(u32 chacha_state[CHACHA_STATE_WORDS], u8 *random_data, size_t random_data_len) { unsigned long flags; struct crng *crng; BUG_ON(random_data_len > 32); /* * For the fast path, we check whether we're ready, unlocked first, and * then re-check once locked later. In the case where we're really not * ready, we do fast key erasure with the base_crng directly, extracting * when crng_init is CRNG_EMPTY. */ if (!crng_ready()) { bool ready; spin_lock_irqsave(&base_crng.lock, flags); ready = crng_ready(); if (!ready) { if (crng_init == CRNG_EMPTY) extract_entropy(base_crng.key, sizeof(base_crng.key)); crng_fast_key_erasure(base_crng.key, chacha_state, random_data, random_data_len); } spin_unlock_irqrestore(&base_crng.lock, flags); if (!ready) return; } local_lock_irqsave(&crngs.lock, flags); crng = raw_cpu_ptr(&crngs); /* * If our per-cpu crng is older than the base_crng, then it means * somebody reseeded the base_crng. In that case, we do fast key * erasure on the base_crng, and use its output as the new key * for our per-cpu crng. This brings us up to date with base_crng. */ if (unlikely(crng->generation != READ_ONCE(base_crng.generation))) { spin_lock(&base_crng.lock); crng_fast_key_erasure(base_crng.key, chacha_state, crng->key, sizeof(crng->key)); crng->generation = base_crng.generation; spin_unlock(&base_crng.lock); } /* * Finally, when we've made it this far, our per-cpu crng has an up * to date key, and we can do fast key erasure with it to produce * some random data and a ChaCha state for the caller. All other * branches of this function are "unlikely", so most of the time we * should wind up here immediately. */ crng_fast_key_erasure(crng->key, chacha_state, random_data, random_data_len); local_unlock_irqrestore(&crngs.lock, flags); } static void _get_random_bytes(void *buf, size_t len) { u32 chacha_state[CHACHA_STATE_WORDS]; u8 tmp[CHACHA_BLOCK_SIZE]; size_t first_block_len; if (!len) return; first_block_len = min_t(size_t, 32, len); crng_make_state(chacha_state, buf, first_block_len); len -= first_block_len; buf += first_block_len; while (len) { if (len < CHACHA_BLOCK_SIZE) { chacha20_block(chacha_state, tmp); memcpy(buf, tmp, len); memzero_explicit(tmp, sizeof(tmp)); break; } chacha20_block(chacha_state, buf); if (unlikely(chacha_state[12] == 0)) ++chacha_state[13]; len -= CHACHA_BLOCK_SIZE; buf += CHACHA_BLOCK_SIZE; } memzero_explicit(chacha_state, sizeof(chacha_state)); } /* * This returns random bytes in arbitrary quantities. The quality of the * random bytes is good as /dev/urandom. In order to ensure that the * randomness provided by this function is okay, the function * wait_for_random_bytes() should be called and return 0 at least once * at any point prior. */ void get_random_bytes(void *buf, size_t len) { warn_unseeded_randomness(); _get_random_bytes(buf, len); } EXPORT_SYMBOL(get_random_bytes); static ssize_t get_random_bytes_user(struct iov_iter *iter) { u32 chacha_state[CHACHA_STATE_WORDS]; u8 block[CHACHA_BLOCK_SIZE]; size_t ret = 0, copied; if (unlikely(!iov_iter_count(iter))) return 0; /* * Immediately overwrite the ChaCha key at index 4 with random * bytes, in case userspace causes copy_to_iter() below to sleep * forever, so that we still retain forward secrecy in that case. */ crng_make_state(chacha_state, (u8 *)&chacha_state[4], CHACHA_KEY_SIZE); /* * However, if we're doing a read of len <= 32, we don't need to * use chacha_state after, so we can simply return those bytes to * the user directly. */ if (iov_iter_count(iter) <= CHACHA_KEY_SIZE) { ret = copy_to_iter(&chacha_state[4], CHACHA_KEY_SIZE, iter); goto out_zero_chacha; } for (;;) { chacha20_block(chacha_state, block); if (unlikely(chacha_state[12] == 0)) ++chacha_state[13]; copied = copy_to_iter(block, sizeof(block), iter); ret += copied; if (!iov_iter_count(iter) || copied != sizeof(block)) break; BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0); if (ret % PAGE_SIZE == 0) { if (signal_pending(current)) break; cond_resched(); } } memzero_explicit(block, sizeof(block)); out_zero_chacha: memzero_explicit(chacha_state, sizeof(chacha_state)); return ret ? ret : -EFAULT; } /* * Batched entropy returns random integers. The quality of the random * number is good as /dev/urandom. In order to ensure that the randomness * provided by this function is okay, the function wait_for_random_bytes() * should be called and return 0 at least once at any point prior. */ #define DEFINE_BATCHED_ENTROPY(type) \ struct batch_ ##type { \ /* \ * We make this 1.5x a ChaCha block, so that we get the \ * remaining 32 bytes from fast key erasure, plus one full \ * block from the detached ChaCha state. We can increase \ * the size of this later if needed so long as we keep the \ * formula of (integer_blocks + 0.5) * CHACHA_BLOCK_SIZE. \ */ \ type entropy[CHACHA_BLOCK_SIZE * 3 / (2 * sizeof(type))]; \ local_lock_t lock; \ unsigned long generation; \ unsigned int position; \ }; \ \ static DEFINE_PER_CPU(struct batch_ ##type, batched_entropy_ ##type) = { \ .lock = INIT_LOCAL_LOCK(batched_entropy_ ##type.lock), \ .position = UINT_MAX \ }; \ \ type get_random_ ##type(void) \ { \ type ret; \ unsigned long flags; \ struct batch_ ##type *batch; \ unsigned long next_gen; \ \ warn_unseeded_randomness(); \ \ if (!crng_ready()) { \ _get_random_bytes(&ret, sizeof(ret)); \ return ret; \ } \ \ local_lock_irqsave(&batched_entropy_ ##type.lock, flags); \ batch = raw_cpu_ptr(&batched_entropy_##type); \ \ next_gen = READ_ONCE(base_crng.generation); \ if (batch->position >= ARRAY_SIZE(batch->entropy) || \ next_gen != batch->generation) { \ _get_random_bytes(batch->entropy, sizeof(batch->entropy)); \ batch->position = 0; \ batch->generation = next_gen; \ } \ \ ret = batch->entropy[batch->position]; \ batch->entropy[batch->position] = 0; \ ++batch->position; \ local_unlock_irqrestore(&batched_entropy_ ##type.lock, flags); \ return ret; \ } \ EXPORT_SYMBOL(get_random_ ##type); DEFINE_BATCHED_ENTROPY(u8) DEFINE_BATCHED_ENTROPY(u16) DEFINE_BATCHED_ENTROPY(u32) DEFINE_BATCHED_ENTROPY(u64) u32 __get_random_u32_below(u32 ceil) { /* * This is the slow path for variable ceil. It is still fast, most of * the time, by doing traditional reciprocal multiplication and * opportunistically comparing the lower half to ceil itself, before * falling back to computing a larger bound, and then rejecting samples * whose lower half would indicate a range indivisible by ceil. The use * of `-ceil % ceil` is analogous to `2^32 % ceil`, but is computable * in 32-bits. */ u32 rand = get_random_u32(); u64 mult; /* * This function is technically undefined for ceil == 0, and in fact * for the non-underscored constant version in the header, we build bug * on that. But for the non-constant case, it's convenient to have that * evaluate to being a straight call to get_random_u32(), so that * get_random_u32_inclusive() can work over its whole range without * undefined behavior. */ if (unlikely(!ceil)) return rand; mult = (u64)ceil * rand; if (unlikely((u32)mult < ceil)) { u32 bound = -ceil % ceil; while (unlikely((u32)mult < bound)) mult = (u64)ceil * get_random_u32(); } return mult >> 32; } EXPORT_SYMBOL(__get_random_u32_below); #ifdef CONFIG_SMP /* * This function is called when the CPU is coming up, with entry * CPUHP_RANDOM_PREPARE, which comes before CPUHP_WORKQUEUE_PREP. */ int __cold random_prepare_cpu(unsigned int cpu) { /* * When the cpu comes back online, immediately invalidate both * the per-cpu crng and all batches, so that we serve fresh * randomness. */ per_cpu_ptr(&crngs, cpu)->generation = ULONG_MAX; per_cpu_ptr(&batched_entropy_u8, cpu)->position = UINT_MAX; per_cpu_ptr(&batched_entropy_u16, cpu)->position = UINT_MAX; per_cpu_ptr(&batched_entropy_u32, cpu)->position = UINT_MAX; per_cpu_ptr(&batched_entropy_u64, cpu)->position = UINT_MAX; return 0; } #endif /********************************************************************** * * Entropy accumulation and extraction routines. * * Callers may add entropy via: * * static void mix_pool_bytes(const void *buf, size_t len) * * After which, if added entropy should be credited: * * static void credit_init_bits(size_t bits) * * Finally, extract entropy via: * * static void extract_entropy(void *buf, size_t len) * **********************************************************************/ enum { POOL_BITS = BLAKE2S_HASH_SIZE * 8, POOL_READY_BITS = POOL_BITS, /* When crng_init->CRNG_READY */ POOL_EARLY_BITS = POOL_READY_BITS / 2 /* When crng_init->CRNG_EARLY */ }; static struct { struct blake2s_state hash; spinlock_t lock; unsigned int init_bits; } input_pool = { .hash.h = { BLAKE2S_IV0 ^ (0x01010000 | BLAKE2S_HASH_SIZE), BLAKE2S_IV1, BLAKE2S_IV2, BLAKE2S_IV3, BLAKE2S_IV4, BLAKE2S_IV5, BLAKE2S_IV6, BLAKE2S_IV7 }, .hash.outlen = BLAKE2S_HASH_SIZE, .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock), }; static void _mix_pool_bytes(const void *buf, size_t len) { blake2s_update(&input_pool.hash, buf, len); } /* * This function adds bytes into the input pool. It does not * update the initialization bit counter; the caller should call * credit_init_bits if this is appropriate. */ static void mix_pool_bytes(const void *buf, size_t len) { unsigned long flags; spin_lock_irqsave(&input_pool.lock, flags); _mix_pool_bytes(buf, len); spin_unlock_irqrestore(&input_pool.lock, flags); } /* * This is an HKDF-like construction for using the hashed collected entropy * as a PRF key, that's then expanded block-by-block. */ static void extract_entropy(void *buf, size_t len) { unsigned long flags; u8 seed[BLAKE2S_HASH_SIZE], next_key[BLAKE2S_HASH_SIZE]; struct { unsigned long rdseed[32 / sizeof(long)]; size_t counter; } block; size_t i, longs; for (i = 0; i < ARRAY_SIZE(block.rdseed);) { longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i); if (longs) { i += longs; continue; } longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i); if (longs) { i += longs; continue; } block.rdseed[i++] = random_get_entropy(); } spin_lock_irqsave(&input_pool.lock, flags); /* seed = HASHPRF(last_key, entropy_input) */ blake2s_final(&input_pool.hash, seed); /* next_key = HASHPRF(seed, RDSEED || 0) */ block.counter = 0; blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed)); blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key)); spin_unlock_irqrestore(&input_pool.lock, flags); memzero_explicit(next_key, sizeof(next_key)); while (len) { i = min_t(size_t, len, BLAKE2S_HASH_SIZE); /* output = HASHPRF(seed, RDSEED || ++counter) */ ++block.counter; blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed)); len -= i; buf += i; } memzero_explicit(seed, sizeof(seed)); memzero_explicit(&block, sizeof(block)); } #define credit_init_bits(bits) if (!crng_ready()) _credit_init_bits(bits) static void __cold _credit_init_bits(size_t bits) { static DECLARE_WORK(set_ready, crng_set_ready); unsigned int new, orig, add; unsigned long flags; if (!bits) return; add = min_t(size_t, bits, POOL_BITS); orig = READ_ONCE(input_pool.init_bits); do { new = min_t(unsigned int, POOL_BITS, orig + add); } while (!try_cmpxchg(&input_pool.init_bits, &orig, new)); if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) { crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */ if (static_key_initialized && system_unbound_wq) queue_work(system_unbound_wq, &set_ready); atomic_notifier_call_chain(&random_ready_notifier, 0, NULL); #ifdef CONFIG_VDSO_GETRANDOM WRITE_ONCE(__arch_get_k_vdso_rng_data()->is_ready, true); #endif wake_up_interruptible(&crng_init_wait); kill_fasync(&fasync, SIGIO, POLL_IN); pr_notice("crng init done\n"); if (urandom_warning.missed) pr_notice("%d urandom warning(s) missed due to ratelimiting\n", urandom_warning.missed); } else if (orig < POOL_EARLY_BITS && new >= POOL_EARLY_BITS) { spin_lock_irqsave(&base_crng.lock, flags); /* Check if crng_init is CRNG_EMPTY, to avoid race with crng_reseed(). */ if (crng_init == CRNG_EMPTY) { extract_entropy(base_crng.key, sizeof(base_crng.key)); crng_init = CRNG_EARLY; } spin_unlock_irqrestore(&base_crng.lock, flags); } } /********************************************************************** * * Entropy collection routines. * * The following exported functions are used for pushing entropy into * the above entropy accumulation routines: * * void add_device_randomness(const void *buf, size_t len); * void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after); * void add_bootloader_randomness(const void *buf, size_t len); * void add_vmfork_randomness(const void *unique_vm_id, size_t len); * void add_interrupt_randomness(int irq); * void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); * void add_disk_randomness(struct gendisk *disk); * * add_device_randomness() adds data to the input pool that * is likely to differ between two devices (or possibly even per boot). * This would be things like MAC addresses or serial numbers, or the * read-out of the RTC. This does *not* credit any actual entropy to * the pool, but it initializes the pool to different values for devices * that might otherwise be identical and have very little entropy * available to them (particularly common in the embedded world). * * add_hwgenerator_randomness() is for true hardware RNGs, and will credit * entropy as specified by the caller. If the entropy pool is full it will * block until more entropy is needed. * * add_bootloader_randomness() is called by bootloader drivers, such as EFI * and device tree, and credits its input depending on whether or not the * command line option 'random.trust_bootloader'. * * add_vmfork_randomness() adds a unique (but not necessarily secret) ID * representing the current instance of a VM to the pool, without crediting, * and then force-reseeds the crng so that it takes effect immediately. * * add_interrupt_randomness() uses the interrupt timing as random * inputs to the entropy pool. Using the cycle counters and the irq source * as inputs, it feeds the input pool roughly once a second or after 64 * interrupts, crediting 1 bit of entropy for whichever comes first. * * add_input_randomness() uses the input layer interrupt timing, as well * as the event type information from the hardware. * * add_disk_randomness() uses what amounts to the seek time of block * layer request events, on a per-disk_devt basis, as input to the * entropy pool. Note that high-speed solid state drives with very low * seek times do not make for good sources of entropy, as their seek * times are usually fairly consistent. * * The last two routines try to estimate how many bits of entropy * to credit. They do this by keeping track of the first and second * order deltas of the event timings. * **********************************************************************/ static bool trust_cpu __initdata = true; static bool trust_bootloader __initdata = true; static int __init parse_trust_cpu(char *arg) { return kstrtobool(arg, &trust_cpu); } static int __init parse_trust_bootloader(char *arg) { return kstrtobool(arg, &trust_bootloader); } early_param("random.trust_cpu", parse_trust_cpu); early_param("random.trust_bootloader", parse_trust_bootloader); static int random_pm_notification(struct notifier_block *nb, unsigned long action, void *data) { unsigned long flags, entropy = random_get_entropy(); /* * Encode a representation of how long the system has been suspended, * in a way that is distinct from prior system suspends. */ ktime_t stamps[] = { ktime_get(), ktime_get_boottime(), ktime_get_real() }; spin_lock_irqsave(&input_pool.lock, flags); _mix_pool_bytes(&action, sizeof(action)); _mix_pool_bytes(stamps, sizeof(stamps)); _mix_pool_bytes(&entropy, sizeof(entropy)); spin_unlock_irqrestore(&input_pool.lock, flags); if (crng_ready() && (action == PM_RESTORE_PREPARE || (action == PM_POST_SUSPEND && !IS_ENABLED(CONFIG_PM_AUTOSLEEP) && !IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP)))) { crng_reseed(NULL); pr_notice("crng reseeded on system resumption\n"); } return 0; } static struct notifier_block pm_notifier = { .notifier_call = random_pm_notification }; /* * This is called extremely early, before time keeping functionality is * available, but arch randomness is. Interrupts are not yet enabled. */ void __init random_init_early(const char *command_line) { unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)]; size_t i, longs, arch_bits; #if defined(LATENT_ENTROPY_PLUGIN) static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy; _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed)); #endif for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) { longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; continue; } longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; continue; } arch_bits -= sizeof(*entropy) * 8; ++i; } _mix_pool_bytes(init_utsname(), sizeof(*(init_utsname()))); _mix_pool_bytes(command_line, strlen(command_line)); /* Reseed if already seeded by earlier phases. */ if (crng_ready()) crng_reseed(NULL); else if (trust_cpu) _credit_init_bits(arch_bits); } /* * This is called a little bit after the prior function, and now there is * access to timestamps counters. Interrupts are not yet enabled. */ void __init random_init(void) { unsigned long entropy = random_get_entropy(); ktime_t now = ktime_get_real(); _mix_pool_bytes(&now, sizeof(now)); _mix_pool_bytes(&entropy, sizeof(entropy)); add_latent_entropy(); /* * If we were initialized by the cpu or bootloader before jump labels * or workqueues are initialized, then we should enable the static * branch here, where it's guaranteed that these have been initialized. */ if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY) crng_set_ready(NULL); /* Reseed if already seeded by earlier phases. */ if (crng_ready()) crng_reseed(NULL); WARN_ON(register_pm_notifier(&pm_notifier)); WARN(!entropy, "Missing cycle counter and fallback timer; RNG " "entropy collection will consequently suffer."); } /* * Add device- or boot-specific data to the input pool to help * initialize it. * * None of this adds any entropy; it is meant to avoid the problem of * the entropy pool having similar initial state across largely * identical devices. */ void add_device_randomness(const void *buf, size_t len) { unsigned long entropy = random_get_entropy(); unsigned long flags; spin_lock_irqsave(&input_pool.lock, flags); _mix_pool_bytes(&entropy, sizeof(entropy)); _mix_pool_bytes(buf, len); spin_unlock_irqrestore(&input_pool.lock, flags); } EXPORT_SYMBOL(add_device_randomness); /* * Interface for in-kernel drivers of true hardware RNGs. Those devices * may produce endless random bits, so this function will sleep for * some amount of time after, if the sleep_after parameter is true. */ void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after) { mix_pool_bytes(buf, len); credit_init_bits(entropy); /* * Throttle writing to once every reseed interval, unless we're not yet * initialized or no entropy is credited. */ if (sleep_after && !kthread_should_stop() && (crng_ready() || !entropy)) schedule_timeout_interruptible(crng_reseed_interval()); } EXPORT_SYMBOL_GPL(add_hwgenerator_randomness); /* * Handle random seed passed by bootloader, and credit it depending * on the command line option 'random.trust_bootloader'. */ void __init add_bootloader_randomness(const void *buf, size_t len) { mix_pool_bytes(buf, len); if (trust_bootloader) credit_init_bits(len * 8); } #if IS_ENABLED(CONFIG_VMGENID) static BLOCKING_NOTIFIER_HEAD(vmfork_chain); /* * Handle a new unique VM ID, which is unique, not secret, so we * don't credit it, but we do immediately force a reseed after so * that it's used by the crng posthaste. */ void __cold add_vmfork_randomness(const void *unique_vm_id, size_t len) { add_device_randomness(unique_vm_id, len); if (crng_ready()) { crng_reseed(NULL); pr_notice("crng reseeded due to virtual machine fork\n"); } blocking_notifier_call_chain(&vmfork_chain, 0, NULL); } #if IS_MODULE(CONFIG_VMGENID) EXPORT_SYMBOL_GPL(add_vmfork_randomness); #endif int __cold register_random_vmfork_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&vmfork_chain, nb); } EXPORT_SYMBOL_GPL(register_random_vmfork_notifier); int __cold unregister_random_vmfork_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&vmfork_chain, nb); } EXPORT_SYMBOL_GPL(unregister_random_vmfork_notifier); #endif struct fast_pool { unsigned long pool[4]; unsigned long last; unsigned int count; struct timer_list mix; }; static void mix_interrupt_randomness(struct timer_list *work); static DEFINE_PER_CPU(struct fast_pool, irq_randomness) = { #ifdef CONFIG_64BIT #define FASTMIX_PERM SIPHASH_PERMUTATION .pool = { SIPHASH_CONST_0, SIPHASH_CONST_1, SIPHASH_CONST_2, SIPHASH_CONST_3 }, #else #define FASTMIX_PERM HSIPHASH_PERMUTATION .pool = { HSIPHASH_CONST_0, HSIPHASH_CONST_1, HSIPHASH_CONST_2, HSIPHASH_CONST_3 }, #endif .mix = __TIMER_INITIALIZER(mix_interrupt_randomness, 0) }; /* * This is [Half]SipHash-1-x, starting from an empty key. Because * the key is fixed, it assumes that its inputs are non-malicious, * and therefore this has no security on its own. s represents the * four-word SipHash state, while v represents a two-word input. */ static void fast_mix(unsigned long s[4], unsigned long v1, unsigned long v2) { s[3] ^= v1; FASTMIX_PERM(s[0], s[1], s[2], s[3]); s[0] ^= v1; s[3] ^= v2; FASTMIX_PERM(s[0], s[1], s[2], s[3]); s[0] ^= v2; } #ifdef CONFIG_SMP /* * This function is called when the CPU has just come online, with * entry CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE. */ int __cold random_online_cpu(unsigned int cpu) { /* * During CPU shutdown and before CPU onlining, add_interrupt_ * randomness() may schedule mix_interrupt_randomness(), and * set the MIX_INFLIGHT flag. However, because the worker can * be scheduled on a different CPU during this period, that * flag will never be cleared. For that reason, we zero out * the flag here, which runs just after workqueues are onlined * for the CPU again. This also has the effect of setting the * irq randomness count to zero so that new accumulated irqs * are fresh. */ per_cpu_ptr(&irq_randomness, cpu)->count = 0; return 0; } #endif static void mix_interrupt_randomness(struct timer_list *work) { struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix); /* * The size of the copied stack pool is explicitly 2 longs so that we * only ever ingest half of the siphash output each time, retaining * the other half as the next "key" that carries over. The entropy is * supposed to be sufficiently dispersed between bits so on average * we don't wind up "losing" some. */ unsigned long pool[2]; unsigned int count; /* Check to see if we're running on the wrong CPU due to hotplug. */ local_irq_disable(); if (fast_pool != this_cpu_ptr(&irq_randomness)) { local_irq_enable(); return; } /* * Copy the pool to the stack so that the mixer always has a * consistent view, before we reenable irqs again. */ memcpy(pool, fast_pool->pool, sizeof(pool)); count = fast_pool->count; fast_pool->count = 0; fast_pool->last = jiffies; local_irq_enable(); mix_pool_bytes(pool, sizeof(pool)); credit_init_bits(clamp_t(unsigned int, (count & U16_MAX) / 64, 1, sizeof(pool) * 8)); memzero_explicit(pool, sizeof(pool)); } void add_interrupt_randomness(int irq) { enum { MIX_INFLIGHT = 1U << 31 }; unsigned long entropy = random_get_entropy(); struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); struct pt_regs *regs = get_irq_regs(); unsigned int new_count; fast_mix(fast_pool->pool, entropy, (regs ? instruction_pointer(regs) : _RET_IP_) ^ swab(irq)); new_count = ++fast_pool->count; if (new_count & MIX_INFLIGHT) return; if (new_count < 1024 && !time_is_before_jiffies(fast_pool->last + HZ)) return; fast_pool->count |= MIX_INFLIGHT; if (!timer_pending(&fast_pool->mix)) { fast_pool->mix.expires = jiffies; add_timer_on(&fast_pool->mix, raw_smp_processor_id()); } } EXPORT_SYMBOL_GPL(add_interrupt_randomness); /* There is one of these per entropy source */ struct timer_rand_state { unsigned long last_time; long last_delta, last_delta2; }; /* * This function adds entropy to the entropy "pool" by using timing * delays. It uses the timer_rand_state structure to make an estimate * of how many bits of entropy this call has added to the pool. The * value "num" is also added to the pool; it should somehow describe * the type of event that just happened. */ static void add_timer_randomness(struct timer_rand_state *state, unsigned int num) { unsigned long entropy = random_get_entropy(), now = jiffies, flags; long delta, delta2, delta3; unsigned int bits; /* * If we're in a hard IRQ, add_interrupt_randomness() will be called * sometime after, so mix into the fast pool. */ if (in_hardirq()) { fast_mix(this_cpu_ptr(&irq_randomness)->pool, entropy, num); } else { spin_lock_irqsave(&input_pool.lock, flags); _mix_pool_bytes(&entropy, sizeof(entropy)); _mix_pool_bytes(&num, sizeof(num)); spin_unlock_irqrestore(&input_pool.lock, flags); } if (crng_ready()) return; /* * Calculate number of bits of randomness we probably added. * We take into account the first, second and third-order deltas * in order to make our estimate. */ delta = now - READ_ONCE(state->last_time); WRITE_ONCE(state->last_time, now); delta2 = delta - READ_ONCE(state->last_delta); WRITE_ONCE(state->last_delta, delta); delta3 = delta2 - READ_ONCE(state->last_delta2); WRITE_ONCE(state->last_delta2, delta2); if (delta < 0) delta = -delta; if (delta2 < 0) delta2 = -delta2; if (delta3 < 0) delta3 = -delta3; if (delta > delta2) delta = delta2; if (delta > delta3) delta = delta3; /* * delta is now minimum absolute delta. Round down by 1 bit * on general principles, and limit entropy estimate to 11 bits. */ bits = min(fls(delta >> 1), 11); /* * As mentioned above, if we're in a hard IRQ, add_interrupt_randomness() * will run after this, which uses a different crediting scheme of 1 bit * per every 64 interrupts. In order to let that function do accounting * close to the one in this function, we credit a full 64/64 bit per bit, * and then subtract one to account for the extra one added. */ if (in_hardirq()) this_cpu_ptr(&irq_randomness)->count += max(1u, bits * 64) - 1; else _credit_init_bits(bits); } void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) { static unsigned char last_value; static struct timer_rand_state input_timer_state = { INITIAL_JIFFIES }; /* Ignore autorepeat and the like. */ if (value == last_value) return; last_value = value; add_timer_randomness(&input_timer_state, (type << 4) ^ code ^ (code >> 4) ^ value); } EXPORT_SYMBOL_GPL(add_input_randomness); #ifdef CONFIG_BLOCK void add_disk_randomness(struct gendisk *disk) { if (!disk || !disk->random) return; /* First major is 1, so we get >= 0x200 here. */ add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); } EXPORT_SYMBOL_GPL(add_disk_randomness); void __cold rand_initialize_disk(struct gendisk *disk) { struct timer_rand_state *state; /* * If kzalloc returns null, we just won't use that entropy * source. */ state = kzalloc(sizeof(struct timer_rand_state), GFP_KERNEL); if (state) { state->last_time = INITIAL_JIFFIES; disk->random = state; } } #endif struct entropy_timer_state { unsigned long entropy; struct timer_list timer; atomic_t samples; unsigned int samples_per_bit; }; /* * Each time the timer fires, we expect that we got an unpredictable jump in * the cycle counter. Even if the timer is running on another CPU, the timer * activity will be touching the stack of the CPU that is generating entropy. * * Note that we don't re-arm the timer in the timer itself - we are happy to be * scheduled away, since that just makes the load more complex, but we do not * want the timer to keep ticking unless the entropy loop is running. * * So the re-arming always happens in the entropy loop itself. */ static void __cold entropy_timer(struct timer_list *timer) { struct entropy_timer_state *state = container_of(timer, struct entropy_timer_state, timer); unsigned long entropy = random_get_entropy(); mix_pool_bytes(&entropy, sizeof(entropy)); if (atomic_inc_return(&state->samples) % state->samples_per_bit == 0) credit_init_bits(1); } /* * If we have an actual cycle counter, see if we can generate enough entropy * with timing noise. */ static void __cold try_to_generate_entropy(void) { enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 }; u8 stack_bytes[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1]; struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES); unsigned int i, num_different = 0; unsigned long last = random_get_entropy(); int cpu = -1; for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) { stack->entropy = random_get_entropy(); if (stack->entropy != last) ++num_different; last = stack->entropy; } stack->samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1); if (stack->samples_per_bit > MAX_SAMPLES_PER_BIT) return; atomic_set(&stack->samples, 0); timer_setup_on_stack(&stack->timer, entropy_timer, 0); while (!crng_ready() && !signal_pending(current)) { /* * Check !timer_pending() and then ensure that any previous callback has finished * executing by checking try_to_del_timer_sync(), before queueing the next one. */ if (!timer_pending(&stack->timer) && try_to_del_timer_sync(&stack->timer) >= 0) { struct cpumask timer_cpus; unsigned int num_cpus; /* * Preemption must be disabled here, both to read the current CPU number * and to avoid scheduling a timer on a dead CPU. */ preempt_disable(); /* Only schedule callbacks on timer CPUs that are online. */ cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask); num_cpus = cpumask_weight(&timer_cpus); /* In very bizarre case of misconfiguration, fallback to all online. */ if (unlikely(num_cpus == 0)) { timer_cpus = *cpu_online_mask; num_cpus = cpumask_weight(&timer_cpus); } /* Basic CPU round-robin, which avoids the current CPU. */ do { cpu = cpumask_next(cpu, &timer_cpus); if (cpu >= nr_cpu_ids) cpu = cpumask_first(&timer_cpus); } while (cpu == smp_processor_id() && num_cpus > 1); /* Expiring the timer at `jiffies` means it's the next tick. */ stack->timer.expires = jiffies; add_timer_on(&stack->timer, cpu); preempt_enable(); } mix_pool_bytes(&stack->entropy, sizeof(stack->entropy)); schedule(); stack->entropy = random_get_entropy(); } mix_pool_bytes(&stack->entropy, sizeof(stack->entropy)); del_timer_sync(&stack->timer); destroy_timer_on_stack(&stack->timer); } /********************************************************************** * * Userspace reader/writer interfaces. * * getrandom(2) is the primary modern interface into the RNG and should * be used in preference to anything else. * * Reading from /dev/random has the same functionality as calling * getrandom(2) with flags=0. In earlier versions, however, it had * vastly different semantics and should therefore be avoided, to * prevent backwards compatibility issues. * * Reading from /dev/urandom has the same functionality as calling * getrandom(2) with flags=GRND_INSECURE. Because it does not block * waiting for the RNG to be ready, it should not be used. * * Writing to either /dev/random or /dev/urandom adds entropy to * the input pool but does not credit it. * * Polling on /dev/random indicates when the RNG is initialized, on * the read side, and when it wants new entropy, on the write side. * * Both /dev/random and /dev/urandom have the same set of ioctls for * adding entropy, getting the entropy count, zeroing the count, and * reseeding the crng. * **********************************************************************/ SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags) { struct iov_iter iter; int ret; if (flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE)) return -EINVAL; /* * Requesting insecure and blocking randomness at the same time makes * no sense. */ if ((flags & (GRND_INSECURE | GRND_RANDOM)) == (GRND_INSECURE | GRND_RANDOM)) return -EINVAL; if (!crng_ready() && !(flags & GRND_INSECURE)) { if (flags & GRND_NONBLOCK) return -EAGAIN; ret = wait_for_random_bytes(); if (unlikely(ret)) return ret; } ret = import_ubuf(ITER_DEST, ubuf, len, &iter); if (unlikely(ret)) return ret; return get_random_bytes_user(&iter); } static __poll_t random_poll(struct file *file, poll_table *wait) { poll_wait(file, &crng_init_wait, wait); return crng_ready() ? EPOLLIN | EPOLLRDNORM : EPOLLOUT | EPOLLWRNORM; } static ssize_t write_pool_user(struct iov_iter *iter) { u8 block[BLAKE2S_BLOCK_SIZE]; ssize_t ret = 0; size_t copied; if (unlikely(!iov_iter_count(iter))) return 0; for (;;) { copied = copy_from_iter(block, sizeof(block), iter); ret += copied; mix_pool_bytes(block, copied); if (!iov_iter_count(iter) || copied != sizeof(block)) break; BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0); if (ret % PAGE_SIZE == 0) { if (signal_pending(current)) break; cond_resched(); } } memzero_explicit(block, sizeof(block)); return ret ? ret : -EFAULT; } static ssize_t random_write_iter(struct kiocb *kiocb, struct iov_iter *iter) { return write_pool_user(iter); } static ssize_t urandom_read_iter(struct kiocb *kiocb, struct iov_iter *iter) { static int maxwarn = 10; /* * Opportunistically attempt to initialize the RNG on platforms that * have fast cycle counters, but don't (for now) require it to succeed. */ if (!crng_ready()) try_to_generate_entropy(); if (!crng_ready()) { if (!ratelimit_disable && maxwarn <= 0) ++urandom_warning.missed; else if (ratelimit_disable || __ratelimit(&urandom_warning)) { --maxwarn; pr_notice("%s: uninitialized urandom read (%zu bytes read)\n", current->comm, iov_iter_count(iter)); } } return get_random_bytes_user(iter); } static ssize_t random_read_iter(struct kiocb *kiocb, struct iov_iter *iter) { int ret; if (!crng_ready() && ((kiocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) || (kiocb->ki_filp->f_flags & O_NONBLOCK))) return -EAGAIN; ret = wait_for_random_bytes(); if (ret != 0) return ret; return get_random_bytes_user(iter); } static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { int __user *p = (int __user *)arg; int ent_count; switch (cmd) { case RNDGETENTCNT: /* Inherently racy, no point locking. */ if (put_user(input_pool.init_bits, p)) return -EFAULT; return 0; case RNDADDTOENTCNT: if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ent_count, p)) return -EFAULT; if (ent_count < 0) return -EINVAL; credit_init_bits(ent_count); return 0; case RNDADDENTROPY: { struct iov_iter iter; ssize_t ret; int len; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ent_count, p++)) return -EFAULT; if (ent_count < 0) return -EINVAL; if (get_user(len, p++)) return -EFAULT; ret = import_ubuf(ITER_SOURCE, p, len, &iter); if (unlikely(ret)) return ret; ret = write_pool_user(&iter); if (unlikely(ret < 0)) return ret; /* Since we're crediting, enforce that it was all written into the pool. */ if (unlikely(ret != len)) return -EFAULT; credit_init_bits(ent_count); return 0; } case RNDZAPENTCNT: case RNDCLEARPOOL: /* No longer has any effect. */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 0; case RNDRESEEDCRNG: if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!crng_ready()) return -ENODATA; crng_reseed(NULL); return 0; default: return -EINVAL; } } static int random_fasync(int fd, struct file *filp, int on) { return fasync_helper(fd, filp, on, &fasync); } const struct file_operations random_fops = { .read_iter = random_read_iter, .write_iter = random_write_iter, .poll = random_poll, .unlocked_ioctl = random_ioctl, .compat_ioctl = compat_ptr_ioctl, .fasync = random_fasync, .llseek = noop_llseek, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; const struct file_operations urandom_fops = { .read_iter = urandom_read_iter, .write_iter = random_write_iter, .unlocked_ioctl = random_ioctl, .compat_ioctl = compat_ptr_ioctl, .fasync = random_fasync, .llseek = noop_llseek, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /******************************************************************** * * Sysctl interface. * * These are partly unused legacy knobs with dummy values to not break * userspace and partly still useful things. They are usually accessible * in /proc/sys/kernel/random/ and are as follows: * * - boot_id - a UUID representing the current boot. * * - uuid - a random UUID, different each time the file is read. * * - poolsize - the number of bits of entropy that the input pool can * hold, tied to the POOL_BITS constant. * * - entropy_avail - the number of bits of entropy currently in the * input pool. Always <= poolsize. * * - write_wakeup_threshold - the amount of entropy in the input pool * below which write polls to /dev/random will unblock, requesting * more entropy, tied to the POOL_READY_BITS constant. It is writable * to avoid breaking old userspaces, but writing to it does not * change any behavior of the RNG. * * - urandom_min_reseed_secs - fixed to the value CRNG_RESEED_INTERVAL. * It is writable to avoid breaking old userspaces, but writing * to it does not change any behavior of the RNG. * ********************************************************************/ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static int sysctl_random_min_urandom_seed = CRNG_RESEED_INTERVAL / HZ; static int sysctl_random_write_wakeup_bits = POOL_READY_BITS; static int sysctl_poolsize = POOL_BITS; static u8 sysctl_bootid[UUID_SIZE]; /* * This function is used to return both the bootid UUID, and random * UUID. The difference is in whether table->data is NULL; if it is, * then a new UUID is generated and returned to the user. */ static int proc_do_uuid(const struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos) { u8 tmp_uuid[UUID_SIZE], *uuid; char uuid_string[UUID_STRING_LEN + 1]; struct ctl_table fake_table = { .data = uuid_string, .maxlen = UUID_STRING_LEN }; if (write) return -EPERM; uuid = table->data; if (!uuid) { uuid = tmp_uuid; generate_random_uuid(uuid); } else { static DEFINE_SPINLOCK(bootid_spinlock); spin_lock(&bootid_spinlock); if (!uuid[8]) generate_random_uuid(uuid); spin_unlock(&bootid_spinlock); } snprintf(uuid_string, sizeof(uuid_string), "%pU", uuid); return proc_dostring(&fake_table, 0, buf, lenp, ppos); } /* The same as proc_dointvec, but writes don't change anything. */ static int proc_do_rointvec(const struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos) { return write ? 0 : proc_dointvec(table, 0, buf, lenp, ppos); } static struct ctl_table random_table[] = { { .procname = "poolsize", .data = &sysctl_poolsize, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "entropy_avail", .data = &input_pool.init_bits, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "write_wakeup_threshold", .data = &sysctl_random_write_wakeup_bits, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_rointvec, }, { .procname = "urandom_min_reseed_secs", .data = &sysctl_random_min_urandom_seed, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_rointvec, }, { .procname = "boot_id", .data = &sysctl_bootid, .mode = 0444, .proc_handler = proc_do_uuid, }, { .procname = "uuid", .mode = 0444, .proc_handler = proc_do_uuid, }, }; /* * random_init() is called before sysctl_init(), * so we cannot call register_sysctl_init() in random_init() */ static int __init random_sysctls_init(void) { register_sysctl_init("kernel/random", random_table); return 0; } device_initcall(random_sysctls_init); #endif |
4 3 4 8 2 4 2 1 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173 * * Copyright (C)2003 USAGI/WIDE Project * * Author Mitsuru KANDA <mk@linux-ipv6.org> */ /* * [Memo] * * Outbound: * The compression of IP datagram MUST be done before AH/ESP processing, * fragmentation, and the addition of Hop-by-Hop/Routing header. * * Inbound: * The decompression of IP datagram MUST be done after the reassembly, * AH/ESP processing. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/module.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ipcomp.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/pfkeyv2.h> #include <linux/random.h> #include <linux/percpu.h> #include <linux/smp.h> #include <linux/list.h> #include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/mutex.h> static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); __be32 spi; const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct ip_comp_hdr *ipcomph = (struct ip_comp_hdr *)(skb->data + offset); struct xfrm_state *x; if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; spi = htonl(ntohs(ipcomph->cpi)); x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6); if (!x) return 0; if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); struct xfrm_state *t = NULL; t = xfrm_state_alloc(net); if (!t) goto out; t->id.proto = IPPROTO_IPV6; t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); if (!t->id.spi) goto error; memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); memcpy(&t->sel, &x->sel, sizeof(t->sel)); t->props.family = AF_INET6; t->props.mode = x->props.mode; memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); memcpy(&t->mark, &x->mark, sizeof(t->mark)); t->if_id = x->if_id; if (xfrm_init_state(t)) goto error; atomic_set(&t->tunnel_users, 1); out: return t; error: t->km.state = XFRM_STATE_DEAD; xfrm_state_put(t); t = NULL; goto out; } static int ipcomp6_tunnel_attach(struct xfrm_state *x) { struct net *net = xs_net(x); int err = 0; struct xfrm_state *t = NULL; __be32 spi; u32 mark = x->mark.m & x->mark.v; spi = xfrm6_tunnel_spi_lookup(net, (xfrm_address_t *)&x->props.saddr); if (spi) t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr, spi, IPPROTO_IPV6, AF_INET6); if (!t) { t = ipcomp6_tunnel_create(x); if (!t) { err = -EINVAL; goto out; } xfrm_state_insert(t); xfrm_state_hold(t); } x->tunnel = t; atomic_inc(&t->tunnel_users); out: return err; } static int ipcomp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { int err = -EINVAL; x->props.header_len = 0; switch (x->props.mode) { case XFRM_MODE_TRANSPORT: break; case XFRM_MODE_TUNNEL: x->props.header_len += sizeof(struct ipv6hdr); break; default: NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp"); goto out; } err = ipcomp_init_state(x, extack); if (err) goto out; if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp6_tunnel_attach(x); if (err) { NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state"); goto out; } } err = 0; out: return err; } static int ipcomp6_rcv_cb(struct sk_buff *skb, int err) { return 0; } static const struct xfrm_type ipcomp6_type = { .owner = THIS_MODULE, .proto = IPPROTO_COMP, .init_state = ipcomp6_init_state, .destructor = ipcomp_destroy, .input = ipcomp_input, .output = ipcomp_output, }; static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, .priority = 0, }; static int __init ipcomp6_init(void) { if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm6_protocol_register(&ipcomp6_protocol, IPPROTO_COMP) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ipcomp6_type, AF_INET6); return -EAGAIN; } return 0; } static void __exit ipcomp6_fini(void) { if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0) pr_info("%s: can't remove protocol\n", __func__); xfrm_unregister_type(&ipcomp6_type, AF_INET6); } module_init(ipcomp6_init); module_exit(ipcomp6_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173"); MODULE_AUTHOR("Mitsuru KANDA <mk@linux-ipv6.org>"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_COMP); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2020 ARM Ltd. */ #ifndef __ASM_VDSO_PROCESSOR_H #define __ASM_VDSO_PROCESSOR_H #ifndef __ASSEMBLY__ /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } static __always_inline void cpu_relax(void) { rep_nop(); } struct getcpu_cache; notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused); #endif /* __ASSEMBLY__ */ #endif /* __ASM_VDSO_PROCESSOR_H */ |
7 1 1 1 4 1 3 3 3 1 1 1 1 2 1 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_connmark.c netfilter connmark retriever action * skb mark is over-written * * Copyright (c) 2011 Felix Fietkau <nbd@openwrt.org> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/pkt_cls.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <uapi/linux/tc_act/tc_connmark.h> #include <net/tc_act/tc_connmark.h> #include <net/tc_wrapper.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> static struct tc_action_ops act_connmark_ops; TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct tcf_connmark_info *ca = to_connmark(a); struct tcf_connmark_parms *parms; struct nf_conntrack_zone zone; struct nf_conn *c; int proto; tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); parms = rcu_dereference_bh(ca->parms); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): if (skb->len < sizeof(struct iphdr)) goto out; proto = NFPROTO_IPV4; break; case htons(ETH_P_IPV6): if (skb->len < sizeof(struct ipv6hdr)) goto out; proto = NFPROTO_IPV6; break; default: goto out; } c = nf_ct_get(skb, &ctinfo); if (c) { skb->mark = READ_ONCE(c->mark); goto count; } if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, parms->net, &tuple)) goto out; zone.id = parms->zone; zone.dir = NF_CT_DEFAULT_ZONE_DIR; thash = nf_conntrack_find_get(parms->net, &zone, &tuple); if (!thash) goto out; c = nf_ct_tuplehash_to_ctrack(thash); skb->mark = READ_ONCE(c->mark); nf_ct_put(c); count: /* using overlimits stats to count how many packets marked */ tcf_action_inc_overlimit_qstats(&ca->common); out: return READ_ONCE(ca->tcf_action); } static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { [TCA_CONNMARK_PARMS] = { .len = sizeof(struct tc_connmark) }, }; static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id); struct tcf_connmark_parms *nparms, *oparms; struct nlattr *tb[TCA_CONNMARK_MAX + 1]; bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_connmark_info *ci; struct tc_connmark *parm; int ret = 0, err; u32 index; if (!nla) return -EINVAL; ret = nla_parse_nested_deprecated(tb, TCA_CONNMARK_MAX, nla, connmark_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_CONNMARK_PARMS]) return -EINVAL; nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); if (!nparms) return -ENOMEM; parm = nla_data(tb[TCA_CONNMARK_PARMS]); index = parm->index; ret = tcf_idr_check_alloc(tn, &index, a, bind); if (!ret) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_connmark_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); err = ret; goto out_free; } ci = to_connmark(*a); nparms->net = net; nparms->zone = parm->zone; ret = ACT_P_CREATED; } else if (ret > 0) { ci = to_connmark(*a); if (bind) { err = ACT_P_BOUND; goto out_free; } if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } nparms->net = rtnl_dereference(ci->parms)->net; nparms->zone = parm->zone; ret = 0; } else { err = ret; goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); spin_unlock_bh(&ci->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (oparms) kfree_rcu(oparms, rcu); return ret; release_idr: tcf_idr_release(*a, bind); out_free: kfree(nparms); return err; } static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_connmark_info *ci = to_connmark(a); struct tc_connmark opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; struct tcf_connmark_parms *parms; struct tcf_t t; spin_lock_bh(&ci->tcf_lock); parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock)); opt.action = ci->tcf_action; opt.zone = parms->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; spin_unlock_bh(&ci->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&ci->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_connmark_cleanup(struct tc_action *a) { struct tcf_connmark_info *ci = to_connmark(a); struct tcf_connmark_parms *parms; parms = rcu_dereference_protected(ci->parms, 1); if (parms) kfree_rcu(parms, rcu); } static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .id = TCA_ID_CONNMARK, .owner = THIS_MODULE, .act = tcf_connmark_act, .dump = tcf_connmark_dump, .init = tcf_connmark_init, .cleanup = tcf_connmark_cleanup, .size = sizeof(struct tcf_connmark_info), }; MODULE_ALIAS_NET_ACT("connmark"); static __net_init int connmark_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id); return tc_action_net_init(net, tn, &act_connmark_ops); } static void __net_exit connmark_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_connmark_ops.net_id); } static struct pernet_operations connmark_net_ops = { .init = connmark_init_net, .exit_batch = connmark_exit_net, .id = &act_connmark_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init connmark_init_module(void) { return tcf_register_action(&act_connmark_ops, &connmark_net_ops); } static void __exit connmark_cleanup_module(void) { tcf_unregister_action(&act_connmark_ops, &connmark_net_ops); } module_init(connmark_init_module); module_exit(connmark_cleanup_module); MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>"); MODULE_DESCRIPTION("Connection tracking mark restoring"); MODULE_LICENSE("GPL"); |
280 124 196 194 195 195 195 1 1 1 1 1 108 107 108 107 2 107 99 10 108 107 107 105 107 107 29 3 83 79 32 104 1 1 107 3 105 105 105 3 107 107 106 107 107 52 94 83 94 62 81 88 16 61 61 28 40 42 37 1 2 10 1 10 9 60 57 95 94 56 10 103 105 55 94 14 14 14 14 14 10 4 13 27 28 28 6 4 134 23 127 36 18 28 5 5 105 144 7 89 11 115 98 62 142 119 68 109 182 126 111 6 1 9 7 2 4 4 9 136 9 127 156 147 10 10 10 10 10 28 37 15 4 1 19 5 6 6 25 108 108 10 1 3 5 1 9 4 1 3 1 1 1 3 178 173 47 12 3 4 6 131 4 123 1 5 5 2 3 149 32 95 1 32 32 32 113 113 16 1 15 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2002, 2004 * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * Copyright (c) 2002-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * SCTP over IPv6. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Le Yanqun <yanqun.le@nokia.com> * Hui Huang <hui.huang@nokia.com> * La Monte H.P. Yarroll <piggy@acm.org> * Sridhar Samudrala <sri@us.ibm.com> * Jon Grimm <jgrimm@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * * Based on: * linux/net/ipv6/tcp_ipv6.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/ipsec.h> #include <linux/slab.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/seq_file.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/sctp/sctp.h> #include <net/udp_tunnel.h> #include <linux/uaccess.h> static inline int sctp_v6_addr_match_len(union sctp_addr *s1, union sctp_addr *s2); static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, __be16 port); static int sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2); /* Event handler for inet6 address addition/deletion events. * The sctp_local_addr_list needs to be protocted by a spin lock since * multiple notifiers (say IPv4 and IPv6) may be running at the same * time and thus corrupt the list. * The reader side is protected with RCU. */ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sctp_sockaddr_entry *addr = NULL; struct sctp_sockaddr_entry *temp; struct net *net = dev_net(ifa->idev->dev); int found = 0; switch (ev) { case NETDEV_UP: addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifa->addr; addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; addr->valid = 1; spin_lock_bh(&net->sctp.local_addr_lock); list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW); spin_unlock_bh(&net->sctp.local_addr_lock); } break; case NETDEV_DOWN: spin_lock_bh(&net->sctp.local_addr_lock); list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET6 && ipv6_addr_equal(&addr->a.v6.sin6_addr, &ifa->addr) && addr->a.v6.sin6_scope_id == ifa->idev->dev->ifindex) { found = 1; addr->valid = 0; list_del_rcu(&addr->list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); break; } } spin_unlock_bh(&net->sctp.local_addr_lock); if (found) kfree_rcu(addr, rcu); break; } return NOTIFY_DONE; } static struct notifier_block sctp_inet6addr_notifier = { .notifier_call = sctp_inet6addr_event, }; static void sctp_v6_err_handle(struct sctp_transport *t, struct sk_buff *skb, __u8 type, __u8 code, __u32 info) { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; int err = 0; switch (type) { case ICMPV6_PKT_TOOBIG: if (ip6_sk_accept_pmtu(sk)) sctp_icmp_frag_needed(sk, asoc, t, info); return; case ICMPV6_PARAMPROB: if (ICMPV6_UNK_NEXTHDR == code) { sctp_icmp_proto_unreachable(sk, asoc, t); return; } break; case NDISC_REDIRECT: sctp_icmp_redirect(sk, t, skb); return; default: break; } icmpv6_err_convert(type, code, &err); if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { sk->sk_err = err; sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } } /* ICMP error handler. */ static int sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct net *net = dev_net(skb->dev); struct sctp_transport *transport; struct sctp_association *asoc; __u16 saveip, savesctp; struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, offset); sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original pointers. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } sctp_v6_err_handle(transport, skb, type, code, ntohl(info)); sctp_err_finish(sk, transport); return 0; } int sctp_udp_v6_err(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sctp_association *asoc; struct sctp_transport *t; struct icmp6hdr *hdr; __u32 info = 0; skb->transport_header += sizeof(struct udphdr); sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &t); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } skb->transport_header -= sizeof(struct udphdr); hdr = (struct icmp6hdr *)(skb_network_header(skb) - sizeof(struct icmp6hdr)); if (hdr->icmp6_type == NDISC_REDIRECT) { /* can't be handled without outer ip6hdr known, leave it to udpv6_err */ sctp_err_finish(sk, t); return 0; } if (hdr->icmp6_type == ICMPV6_PKT_TOOBIG) info = ntohl(hdr->icmp6_mtu); sctp_v6_err_handle(t, skb, hdr->icmp6_type, hdr->icmp6_code, info); sctp_err_finish(sk, t); return 1; } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) { struct dst_entry *dst = dst_clone(t->dst); struct flowi6 *fl6 = &t->fl.u.ip6; struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); __u8 tclass = np->tclass; __be32 label; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); if (t->dscp & SCTP_DSCP_SET_MASK) tclass = t->dscp & SCTP_DSCP_VAL_MASK; if (INET_ECN_is_capable(tclass)) IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(t->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); if (!t->encap_port || !sctp_sk(sk)->udp_port) { int res; skb_dst_set(skb, dst); rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); return res; } if (skb_is_gso(skb)) skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; skb->encapsulation = 1; skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6); return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, tclass, ip6_dst_hoplimit(dst), label, sctp_sk(sk)->udp_port, t->encap_port, false); } /* Returns the dst cache entry for the given source and destination ip * addresses. */ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk) { struct sctp_association *asoc = t->asoc; struct dst_entry *dst = NULL; struct flowi _fl; struct flowi6 *fl6 = &_fl.u.ip6; struct sctp_bind_addr *bp; struct ipv6_pinfo *np = inet6_sk(sk); struct sctp_sockaddr_entry *laddr; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; struct in6_addr *final_p, final; enum sctp_scope scope; __u8 matchlen = 0; memset(&_fl, 0, sizeof(_fl)); fl6->daddr = daddr->v6.sin6_addr; fl6->fl6_dport = daddr->v6.sin6_port; fl6->flowi6_proto = IPPROTO_SCTP; if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; if (t->flowlabel & SCTP_FLOWLABEL_SET_MASK) fl6->flowlabel = htonl(t->flowlabel & SCTP_FLOWLABEL_VAL_MASK); if (inet6_test_bit(SNDFLOW, sk) && (fl6->flowlabel & IPV6_FLOWLABEL_MASK)) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) goto out; fl6_sock_release(flowlabel); } pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); if (asoc) fl6->fl6_sport = htons(asoc->base.bind_addr.port); if (saddr) { fl6->saddr = saddr->v6.sin6_addr; if (!fl6->fl6_sport) fl6->fl6_sport = saddr->v6.sin6_port; pr_debug("src=%pI6 - ", &fl6->saddr); } rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!asoc || saddr) { t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); goto out; } bp = &asoc->base.bind_addr; scope = sctp_scope(daddr); /* ip6_dst_lookup has filled in the fl6->saddr for us. Check * to see if we can use it. */ if (!IS_ERR(dst)) { /* Walk through the bind address list and look for a bind * address that matches the source address of the returned dst. */ sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid || laddr->state == SCTP_ADDR_DEL || (laddr->state != SCTP_ADDR_SRC && !asoc->src_out_of_asoc_ok)) continue; /* Do not compare against v4 addrs */ if ((laddr->a.sa.sa_family == AF_INET6) && (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) { rcu_read_unlock(); t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); goto out; } } rcu_read_unlock(); /* None of the bound addresses match the source address of the * dst. So release it. */ dst_release(dst); dst = NULL; } /* Walk through the bind address list and try to get the * best source address for a given destination. */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { struct dst_entry *bdst; __u8 bmatchlen; if (!laddr->valid || laddr->state != SCTP_ADDR_SRC || laddr->a.sa.sa_family != AF_INET6 || scope > sctp_scope(&laddr->a)) continue; fl6->saddr = laddr->a.v6.sin6_addr; fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(bdst)) continue; if (ipv6_chk_addr(dev_net(bdst->dev), &laddr->a.v6.sin6_addr, bdst->dev, 1)) { if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); break; } bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); if (matchlen > bmatchlen) { dst_release(bdst); continue; } if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; matchlen = bmatchlen; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } rcu_read_unlock(); out: if (!IS_ERR_OR_NULL(dst)) { struct rt6_info *rt; rt = dst_rt6_info(dst); t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl->u.ip6.saddr); } else { t->dst = NULL; pr_debug("no route\n"); } } /* Returns the number of consecutive initial bits that match in the 2 ipv6 * addresses. */ static inline int sctp_v6_addr_match_len(union sctp_addr *s1, union sctp_addr *s2) { return ipv6_addr_diff(&s1->v6.sin6_addr, &s2->v6.sin6_addr); } /* Fills in the source address(saddr) based on the destination address(daddr) * and asoc's bind address list. */ static void sctp_v6_get_saddr(struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl) { struct flowi6 *fl6 = &fl->u.ip6; union sctp_addr *saddr = &t->saddr; pr_debug("%s: asoc:%p dst:%p\n", __func__, t->asoc, t->dst); if (t->dst) { saddr->v6.sin6_family = AF_INET6; saddr->v6.sin6_addr = fl6->saddr; } } /* Make a copy of all potential local addresses. */ static void sctp_v6_copy_addrlist(struct list_head *addrlist, struct net_device *dev) { struct inet6_dev *in6_dev; struct inet6_ifaddr *ifp; struct sctp_sockaddr_entry *addr; rcu_read_lock(); if ((in6_dev = __in6_dev_get(dev)) == NULL) { rcu_read_unlock(); return; } read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifp->addr; addr->a.v6.sin6_scope_id = dev->ifindex; addr->valid = 1; INIT_LIST_HEAD(&addr->list); list_add_tail(&addr->list, addrlist); } } read_unlock_bh(&in6_dev->lock); rcu_read_unlock(); } /* Copy over any ip options */ static void sctp_v6_copy_ip_options(struct sock *sk, struct sock *newsk) { struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct ipv6_txoptions *opt; newnp = inet6_sk(newsk); rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); if (!opt) pr_err("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); } /* Account for the IP options */ static int sctp_v6_ip_options_len(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; int len = 0; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) len = opt->opt_flen + opt->opt_nflen; rcu_read_unlock(); return len; } /* Initialize a sockaddr_storage from in incoming skb. */ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { /* Always called on head skb, so this is safe */ struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in6 *sa = &addr->v6; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; if (is_saddr) { sa->sin6_port = sh->source; sa->sin6_addr = ipv6_hdr(skb)->saddr; } else { sa->sin6_port = sh->dest; sa->sin6_addr = ipv6_hdr(skb)->daddr; } } /* Initialize an sctp_addr from a socket. */ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = 0; addr->v6.sin6_addr = sk->sk_v6_rcv_saddr; } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) { if (addr->sa.sa_family == AF_INET) { sk->sk_v6_rcv_saddr.s6_addr32[0] = 0; sk->sk_v6_rcv_saddr.s6_addr32[1] = 0; sk->sk_v6_rcv_saddr.s6_addr32[2] = htonl(0x0000ffff); sk->sk_v6_rcv_saddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { sk->sk_v6_rcv_saddr = addr->v6.sin6_addr; } } /* Initialize sk->sk_daddr from sctp_addr. */ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) { if (addr->sa.sa_family == AF_INET) { sk->sk_v6_daddr.s6_addr32[0] = 0; sk->sk_v6_daddr.s6_addr32[1] = 0; sk->sk_v6_daddr.s6_addr32[2] = htonl(0x0000ffff); sk->sk_v6_daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { sk->sk_v6_daddr = addr->v6.sin6_addr; } } /* Initialize a sctp_addr from an address parameter. */ static bool sctp_v6_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param)) return false; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; return true; } /* Initialize an address parameter from a sctp_addr and return the length * of the address parameter. */ static int sctp_v6_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { int length = sizeof(struct sctp_ipv6addr_param); param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; param->v6.param_hdr.length = htons(length); param->v6.addr = addr->v6.sin6_addr; return length; } /* Initialize a sctp_addr from struct in6_addr. */ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, __be16 port) { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = *saddr; addr->v6.sin6_scope_id = 0; } static int __sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { if (addr1->sa.sa_family != addr2->sa.sa_family) { if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr2->v6.sin6_addr) && addr2->v6.sin6_addr.s6_addr32[3] == addr1->v4.sin_addr.s_addr) return 1; if (addr2->sa.sa_family == AF_INET && addr1->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr1->v6.sin6_addr) && addr1->v6.sin6_addr.s6_addr32[3] == addr2->v4.sin_addr.s_addr) return 1; return 0; } if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) return 0; /* If this is a linklocal address, compare the scope_id. */ if ((ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) && addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id && addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id) return 0; return 1; } /* Compare addresses exactly. * v4-mapped-v6 is also in consideration. */ static int sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { return __sctp_v6_cmp_addr(addr1, addr2) && addr1->v6.sin6_port == addr2->v6.sin6_port; } /* Initialize addr struct to INADDR_ANY. */ static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port) { memset(addr, 0x00, sizeof(union sctp_addr)); addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; } /* Is this a wildcard address? */ static int sctp_v6_is_any(const union sctp_addr *addr) { return ipv6_addr_any(&addr->v6.sin6_addr); } /* Should this be available for binding? */ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) { const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr; struct sock *sk = &sp->inet.sk; struct net *net = sock_net(sk); struct net_device *dev = NULL; int type, res, bound_dev_if; type = ipv6_addr_type(in6); if (IPV6_ADDR_ANY == type) return 1; if (type == IPV6_ADDR_MAPPED) { if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); return sctp_get_af_specific(AF_INET)->available(addr, sp); } if (!(type & IPV6_ADDR_UNICAST)) return 0; rcu_read_lock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if) { res = 0; dev = dev_get_by_index_rcu(net, bound_dev_if); if (!dev) goto out; } res = ipv6_can_nonlocal_bind(net, &sp->inet) || ipv6_chk_addr(net, in6, dev, 0); out: rcu_read_unlock(); return res; } /* This function checks if the address is a valid address to be used for * SCTP. * * Output: * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp, const struct sk_buff *skb) { int ret = ipv6_addr_type(&addr->v6.sin6_addr); /* Support v4-mapped-v6 address. */ if (ret == IPV6_ADDR_MAPPED) { /* Note: This routine is used in input, so v4-mapped-v6 * are disallowed here when there is no sctp_sock. */ if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb); } /* Is this a non-unicast address */ if (!(ret & IPV6_ADDR_UNICAST)) return 0; return 1; } /* What is the scope of 'addr'? */ static enum sctp_scope sctp_v6_scope(union sctp_addr *addr) { enum sctp_scope retval; int v6scope; /* The IPv6 scope is really a set of bit fields. * See IFA_* in <net/if_inet6.h>. Map to a generic SCTP scope. */ v6scope = ipv6_addr_scope(&addr->v6.sin6_addr); switch (v6scope) { case IFA_HOST: retval = SCTP_SCOPE_LOOPBACK; break; case IFA_LINK: retval = SCTP_SCOPE_LINK; break; case IFA_SITE: retval = SCTP_SCOPE_PRIVATE; break; default: retval = SCTP_SCOPE_GLOBAL; break; } return retval; } /* Create and initialize a new sk for the socket to be returned by accept(). */ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct sctp_association *asoc, bool kern) { struct sock *newsk; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern); if (!newsk) goto out; sock_init_data(NULL, newsk); sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(sk, SOCK_ZAPPED); newsctp6sk = (struct sctp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; sctp_v6_copy_ip_options(sk, newsk); /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() * and getpeername(). */ sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; } out: return newsk; } /* Format a sockaddr for return to user space. This makes sure the return is * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option. */ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { if (sp->v4mapped) { if (addr->sa.sa_family == AF_INET) sctp_v4_map_v6(addr); } else { if (addr->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr->v6.sin6_addr)) sctp_v6_map_v4(addr); } if (addr->sa.sa_family == AF_INET) { memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); } return sizeof(struct sockaddr_in6); } /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { return inet6_iif(skb); } static int sctp_v6_skb_sdif(const struct sk_buff *skb) { return inet6_sdif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v6_is_ce(const struct sk_buff *skb) { return *((__u32 *)(ipv6_hdr(skb))) & (__force __u32)htonl(1 << 20); } /* Dump the v6 addr to the seq file. */ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { seq_printf(seq, "%pI6 ", &addr->v6.sin6_addr); } static void sctp_v6_ecn_capable(struct sock *sk) { inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } /* Initialize a PF_INET msgname from a ulpevent. */ static void sctp_inet6_event_msgname(struct sctp_ulpevent *event, char *msgname, int *addrlen) { union sctp_addr *addr; struct sctp_association *asoc; union sctp_addr *paddr; if (!msgname) return; addr = (union sctp_addr *)msgname; asoc = event->asoc; paddr = &asoc->peer.primary_addr; if (paddr->sa.sa_family == AF_INET) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = htons(asoc->peer.port); addr->v4.sin_addr = paddr->v4.sin_addr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; if (ipv6_addr_type(&paddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = paddr->v6.sin6_scope_id; else addr->v6.sin6_scope_id = 0; addr->v6.sin6_port = htons(asoc->peer.port); addr->v6.sin6_addr = paddr->v6.sin6_addr; } *addrlen = sctp_v6_addr_to_user(sctp_sk(asoc->base.sk), addr); } /* Initialize a msg_name from an inbound skb. */ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, int *addr_len) { union sctp_addr *addr; struct sctphdr *sh; if (!msgname) return; addr = (union sctp_addr *)msgname; sh = sctp_hdr(skb); if (ip_hdr(skb)->version == 4) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = sh->source; addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); else addr->v6.sin6_scope_id = 0; } *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr); } /* Do we support this AF? */ static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) { switch (family) { case AF_INET6: return 1; /* v4-mapped-v6 addresses */ case AF_INET: if (!ipv6_only_sock(sctp_opt2sk(sp))) return 1; fallthrough; default: return 0; } } /* Address matching with wildcards allowed. This extra level * of indirection lets us choose whether a PF_INET6 should * disallow any v4 addresses if we so choose. */ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2, struct sctp_sock *opt) { struct sock *sk = sctp_opt2sk(opt); struct sctp_af *af1, *af2; af1 = sctp_get_af_specific(addr1->sa.sa_family); af2 = sctp_get_af_specific(addr2->sa.sa_family); if (!af1 || !af2) return 0; /* If the socket is IPv6 only, v4 addrs will not match */ if (ipv6_only_sock(sk) && af1 != af2) return 0; /* Today, wildcard AF_INET/AF_INET6. */ if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2)) return 1; if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET) return addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr; return __sctp_v6_cmp_addr(addr1, addr2); } /* Verify that the provided sockaddr looks bindable. Common verification, * has already been taken care of. */ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) { struct sctp_af *af; /* ASSERT: address family has already been verified. */ if (addr->sa.sa_family != AF_INET6) af = sctp_get_af_specific(addr->sa.sa_family); else { int type = ipv6_addr_type(&addr->v6.sin6_addr); struct net_device *dev; if (type & IPV6_ADDR_LINKLOCAL) { struct net *net; if (!addr->v6.sin6_scope_id) return 0; net = sock_net(&opt->inet.sk); rcu_read_lock(); dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id); if (!dev || !(ipv6_can_nonlocal_bind(net, &opt->inet) || ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0))) { rcu_read_unlock(); return 0; } rcu_read_unlock(); } af = opt->pf->af; } return af->available(addr, opt); } /* Verify that the provided sockaddr looks sendable. Common verification, * has already been taken care of. */ static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr) { struct sctp_af *af = NULL; /* ASSERT: address family has already been verified. */ if (addr->sa.sa_family != AF_INET6) af = sctp_get_af_specific(addr->sa.sa_family); else { int type = ipv6_addr_type(&addr->v6.sin6_addr); struct net_device *dev; if (type & IPV6_ADDR_LINKLOCAL) { if (!addr->v6.sin6_scope_id) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(&opt->inet.sk), addr->v6.sin6_scope_id); rcu_read_unlock(); if (!dev) return 0; } af = opt->pf->af; } return af != NULL; } /* Fill in Supported Address Type information for INIT and INIT-ACK * chunks. Note: In the future, we may want to look at sock options * to determine whether a PF_INET6 socket really wants to have IPV4 * addresses. * Returns number of addresses supported. */ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, __be16 *types) { types[0] = SCTP_PARAM_IPV6_ADDRESS; if (!opt || !ipv6_only_sock(sctp_opt2sk(opt))) { types[1] = SCTP_PARAM_IPV4_ADDRESS; return 2; } return 1; } /* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */ static int sctp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int rc; rc = inet6_getname(sock, uaddr, peer); if (rc < 0) return rc; rc = sctp_v6_addr_to_user(sctp_sk(sock->sk), (union sctp_addr *)uaddr); return rc; } static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = sctp_inet_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, .poll = sctp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw sctpv6_seqpacket_protosw = { .type = SOCK_SEQPACKET, .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctpv6_stream_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG, }; static int sctp6_rcv(struct sk_buff *skb) { SCTP_INPUT_CB(skb)->encap_port = 0; return sctp_rcv(skb) ? -1 : 0; } static const struct inet6_protocol sctpv6_protocol = { .handler = sctp6_rcv, .err_handler = sctp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; static struct sctp_af sctp_af_inet6 = { .sa_family = AF_INET6, .sctp_xmit = sctp_v6_xmit, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .get_dst = sctp_v6_get_dst, .get_saddr = sctp_v6_get_saddr, .copy_addrlist = sctp_v6_copy_addrlist, .from_skb = sctp_v6_from_skb, .from_sk = sctp_v6_from_sk, .from_addr_param = sctp_v6_from_addr_param, .to_addr_param = sctp_v6_to_addr_param, .cmp_addr = sctp_v6_cmp_addr, .scope = sctp_v6_scope, .addr_valid = sctp_v6_addr_valid, .inaddr_any = sctp_v6_inaddr_any, .is_any = sctp_v6_is_any, .available = sctp_v6_available, .skb_iif = sctp_v6_skb_iif, .skb_sdif = sctp_v6_skb_sdif, .is_ce = sctp_v6_is_ce, .seq_dump_addr = sctp_v6_seq_dump_addr, .ecn_capable = sctp_v6_ecn_capable, .net_header_len = sizeof(struct ipv6hdr), .sockaddr_len = sizeof(struct sockaddr_in6), .ip_options_len = sctp_v6_ip_options_len, }; static struct sctp_pf sctp_pf_inet6 = { .event_msgname = sctp_inet6_event_msgname, .skb_msgname = sctp_inet6_skb_msgname, .af_supported = sctp_inet6_af_supported, .cmp_addr = sctp_inet6_cmp_addr, .bind_verify = sctp_inet6_bind_verify, .send_verify = sctp_inet6_send_verify, .supported_addrs = sctp_inet6_supported_addrs, .create_accept_sk = sctp_v6_create_accept_sk, .addr_to_user = sctp_v6_addr_to_user, .to_sk_saddr = sctp_v6_to_sk_saddr, .to_sk_daddr = sctp_v6_to_sk_daddr, .copy_ip_options = sctp_v6_copy_ip_options, .af = &sctp_af_inet6, }; /* Initialize IPv6 support and register with socket layer. */ void sctp_v6_pf_init(void) { /* Register the SCTP specific PF_INET6 functions. */ sctp_register_pf(&sctp_pf_inet6, PF_INET6); /* Register the SCTP specific AF_INET6 functions. */ sctp_register_af(&sctp_af_inet6); } void sctp_v6_pf_exit(void) { list_del(&sctp_af_inet6.list); } /* Initialize IPv6 support and register with socket layer. */ int sctp_v6_protosw_init(void) { int rc; rc = proto_register(&sctpv6_prot, 1); if (rc) return rc; /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */ inet6_register_protosw(&sctpv6_seqpacket_protosw); inet6_register_protosw(&sctpv6_stream_protosw); return 0; } void sctp_v6_protosw_exit(void) { inet6_unregister_protosw(&sctpv6_seqpacket_protosw); inet6_unregister_protosw(&sctpv6_stream_protosw); proto_unregister(&sctpv6_prot); } /* Register with inet6 layer. */ int sctp_v6_add_protocol(void) { /* Register notifier for inet6 address additions/deletions. */ register_inet6addr_notifier(&sctp_inet6addr_notifier); if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) return -EAGAIN; return 0; } /* Unregister with inet6 layer. */ void sctp_v6_del_protocol(void) { inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); unregister_inet6addr_notifier(&sctp_inet6addr_notifier); } |
13 16 10 6 1 10 16 13 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. * * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ static inline void *kmap_local_folio(struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. */ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) { } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } static inline void zero_user(struct page *page, unsigned start, unsigned size) { zero_user_segments(page, start, start + size, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). */ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_highmem(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. * @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. * @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } static inline void unmap_and_put_page(struct page *page, void *addr) { folio_release_kmap(page_folio(page), addr); } #endif /* _LINUX_HIGHMEM_H */ |
4 4 4 4 13 13 13 1 13 13 15 16 16 6 6 6 16 1 8 1 2 13 13 1 12 1 1 10 12 9 1 3 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_basic.c Basic Packet Classifier. * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/idr.h> #include <linux/percpu.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> struct basic_head { struct list_head flist; struct idr handle_idr; struct rcu_head rcu; }; struct basic_filter { u32 handle; struct tcf_exts exts; struct tcf_ematch_tree ematches; struct tcf_result res; struct tcf_proto *tp; struct list_head link; struct tc_basic_pcnt __percpu *pf; struct rcu_work rwork; }; TC_INDIRECT_SCOPE int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { int r; struct basic_head *head = rcu_dereference_bh(tp->root); struct basic_filter *f; list_for_each_entry_rcu(f, &head->flist, link) { __this_cpu_inc(f->pf->rcnt); if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; __this_cpu_inc(f->pf->rhit); *res = f->res; r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) continue; return r; } return -1; } static void *basic_get(struct tcf_proto *tp, u32 handle) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { if (f->handle == handle) { return f; } } return NULL; } static int basic_init(struct tcf_proto *tp) { struct basic_head *head; head = kzalloc(sizeof(*head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD(&head->flist); idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; } static void __basic_delete_filter(struct basic_filter *f) { tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); tcf_exts_put_net(&f->exts); free_percpu(f->pf); kfree(f); } static void basic_delete_filter_work(struct work_struct *work) { struct basic_filter *f = container_of(to_rcu_work(work), struct basic_filter, rwork); rtnl_lock(); __basic_delete_filter(f); rtnl_unlock(); } static void basic_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f, *n; list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); idr_remove(&head->handle_idr, f->handle); if (tcf_exts_get_net(&f->exts)) tcf_queue_work(&f->rwork, basic_delete_filter_work); else __basic_delete_filter(f); } idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } static int basic_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f = arg; list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); idr_remove(&head->handle_idr, f->handle); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, basic_delete_filter_work); *last = list_empty(&head->flist); return 0; } static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { [TCA_BASIC_CLASSID] = { .type = NLA_U32 }, [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, }; static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct basic_filter *f, unsigned long base, struct nlattr **tb, struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &f->ematches); if (err < 0) return err; if (tb[TCA_BASIC_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]); tcf_bind_filter(tp, &f->res, base); } f->tp = tp; return 0; } static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { int err; struct basic_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_BASIC_MAX + 1]; struct basic_filter *fold = (struct basic_filter *) *arg; struct basic_filter *fnew; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS], basic_policy, NULL); if (err < 0) return err; if (fold != NULL) { if (handle && fold->handle != handle) return -EINVAL; } fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); if (!fnew) return -ENOBUFS; err = tcf_exts_init(&fnew->exts, net, TCA_BASIC_ACT, TCA_BASIC_POLICE); if (err < 0) goto errout; if (!handle) { handle = 1; err = idr_alloc_u32(&head->handle_idr, fnew, &handle, INT_MAX, GFP_KERNEL); } else if (!fold) { err = idr_alloc_u32(&head->handle_idr, fnew, &handle, handle, GFP_KERNEL); } if (err) goto errout; fnew->handle = handle; fnew->pf = alloc_percpu(struct tc_basic_pcnt); if (!fnew->pf) { err = -ENOMEM; goto errout; } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], flags, extack); if (err < 0) { if (!fold) idr_remove(&head->handle_idr, fnew->handle); goto errout; } *arg = fnew; if (fold) { idr_replace(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->link, &fnew->link); tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); tcf_queue_work(&fold->rwork, basic_delete_filter_work); } else { list_add_rcu(&fnew->link, &head->flist); } return 0; errout: free_percpu(fnew->pf); tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; } static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { if (!tc_cls_stats_dump(tp, arg, f)) break; } } static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct basic_filter *f = fh; tc_cls_bind_class(classid, cl, q, &f->res, base); } static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct tc_basic_pcnt gpf = {}; struct basic_filter *f = fh; struct nlattr *nest; int cpu; if (f == NULL) return skb->len; t->tcm_handle = f->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (f->res.classid && nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) goto nla_put_failure; for_each_possible_cpu(cpu) { struct tc_basic_pcnt *pf = per_cpu_ptr(f->pf, cpu); gpf.rcnt += pf->rcnt; gpf.rhit += pf->rhit; } if (nla_put_64bit(skb, TCA_BASIC_PCNT, sizeof(struct tc_basic_pcnt), &gpf, TCA_BASIC_PAD)) goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static struct tcf_proto_ops cls_basic_ops __read_mostly = { .kind = "basic", .classify = basic_classify, .init = basic_init, .destroy = basic_destroy, .get = basic_get, .change = basic_change, .delete = basic_delete, .walk = basic_walk, .dump = basic_dump, .bind_class = basic_bind_class, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_CLS("basic"); static int __init init_basic(void) { return register_tcf_proto_ops(&cls_basic_ops); } static void __exit exit_basic(void) { unregister_tcf_proto_ops(&cls_basic_ops); } module_init(init_basic) module_exit(exit_basic) MODULE_DESCRIPTION("TC basic classifier"); MODULE_LICENSE("GPL"); |
1 1 153 105 56 106 112 106 72 74 63 154 171 10 10 9 165 1 156 10 164 10 9 164 1 5 4 10 1 5 8 1 2 5 77 69 7 81 80 113 172 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* First Come First Serve (a.k.a. FIFO) * RFC DRAFT ndata Section 3.1 */ static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, __u16 value, gfp_t gfp) { return 0; } static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { *value = 0; return 0; } static int sctp_sched_fcfs_init(struct sctp_stream *stream) { return 0; } static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { return 0; } static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { } static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_chunk *ch = NULL; struct list_head *entry; if (list_empty(&q->out_chunk_list)) goto out; if (stream->out_curr) { ch = list_entry(stream->out_curr->ext->outq.next, struct sctp_chunk, stream_list); } else { entry = q->out_chunk_list.next; ch = list_entry(entry, struct sctp_chunk, list); } sctp_sched_dequeue_common(q, ch); out: return ch; } static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, struct sctp_chunk *chunk) { } static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) { } static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) { } static struct sctp_sched_ops sctp_sched_fcfs = { .set = sctp_sched_fcfs_set, .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, .free_sid = sctp_sched_fcfs_free_sid, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, .dequeue_done = sctp_sched_fcfs_dequeue_done, .sched_all = sctp_sched_fcfs_sched_all, .unsched_all = sctp_sched_fcfs_unsched_all, }; static void sctp_sched_ops_fcfs_init(void) { sctp_sched_ops_register(SCTP_SS_FCFS, &sctp_sched_fcfs); } /* API to other parts of the stack */ static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } void sctp_sched_ops_init(void) { sctp_sched_ops_fcfs_init(); sctp_sched_ops_prio_init(); sctp_sched_ops_rr_init(); sctp_sched_ops_fc_init(); sctp_sched_ops_wfq_init(); } static void sctp_sched_free_sched(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { soute = SCTP_SO(stream, i)->ext; if (!soute) continue; sched->free_sid(stream, i); /* Give the next scheduler a clean slate. */ memset_after(soute, 0, outq); } } int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; if (sched > SCTP_SS_MAX) return -EINVAL; n = sctp_sched_ops[sched]; if (old == n) return ret; if (old) sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } /* We have to requeue all chunks already queued. */ list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { if (ch->msg == msg) continue; msg = ch->msg; n->enqueue(&asoc->outqueue, msg); } return ret; err: sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; } int sctp_sched_get_sched(struct sctp_association *asoc) { int i; for (i = 0; i <= SCTP_SS_MAX; i++) if (asoc->outqueue.sched == sctp_sched_ops[i]) return i; return 0; } int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, __u16 value, gfp_t gfp) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); if (ret) return ret; } return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); } int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, __u16 *value) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); } void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && !q->asoc->peer.intl_capable) { struct sctp_stream_out *sout; __u16 sid; /* datamsg is not finish, so save it as current one, * in case application switch scheduler or a higher * priority stream comes in. */ sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } q->asoc->stream.out_curr = NULL; q->sched->dequeue_done(q, ch); } /* Auxiliary functions for the schedulers */ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) { list_del_init(&ch->list); list_del_init(&ch->stream_list); q->out_qlen -= ch->skb->len; } int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); return asoc->outqueue.sched; } |
80 10 1 79 80 78 11 8 77 1 76 78 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/utsname.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include "internal.h" static const struct proc_ns_operations *ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif #ifdef CONFIG_UTS_NS &utsns_operations, #endif #ifdef CONFIG_IPC_NS &ipcns_operations, #endif #ifdef CONFIG_PID_NS &pidns_operations, &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, #endif &mntns_operations, #ifdef CONFIG_CGROUPS &cgroupns_operations, #endif #ifdef CONFIG_TIME_NS &timens_operations, &timens_for_children_operations, #endif }; static const char *proc_ns_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; int error = -EACCES; if (!dentry) return ERR_PTR(-ECHILD); task = get_proc_task(inode); if (!task) return ERR_PTR(-EACCES); if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) goto out; error = ns_get_path(&ns_path, task, ns_ops); if (error) goto out; error = nd_jump_link(&ns_path); out: put_task_struct(task); return ERR_PTR(error); } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; char name[50]; int res = -EACCES; task = get_proc_task(inode); if (!task) return res; if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { res = ns_get_name(name, sizeof(name), task, ns_ops); if (res >= 0) res = readlink_copy(buffer, buflen, name); } put_task_struct(task); return res; } static const struct inode_operations proc_ns_link_inode_operations = { .readlink = proc_ns_readlink, .get_link = proc_ns_get_link, .setattr = proc_setattr, }; static struct dentry *proc_ns_instantiate(struct dentry *dentry, struct task_struct *task, const void *ptr) { const struct proc_ns_operations *ns_ops = ptr; struct inode *inode; struct proc_inode *ei; inode = proc_pid_make_inode(dentry->d_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) return ERR_PTR(-ENOENT); ei = PROC_I(inode); inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; pid_update_inode(task, inode); d_set_d_op(dentry, &pid_dentry_operations); return d_splice_alias(inode, dentry); } static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); const struct proc_ns_operations **entry, **last; if (!task) return -ENOENT; if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos >= 2 + ARRAY_SIZE(ns_entries)) goto out; entry = ns_entries + (ctx->pos - 2); last = &ns_entries[ARRAY_SIZE(ns_entries) - 1]; while (entry <= last) { const struct proc_ns_operations *ops = *entry; if (!proc_fill_cache(file, ctx, ops->name, strlen(ops->name), proc_ns_instantiate, task, ops)) break; ctx->pos++; entry++; } out: put_task_struct(task); return 0; } const struct file_operations proc_ns_dir_operations = { .read = generic_read_dir, .iterate_shared = proc_ns_dir_readdir, .llseek = generic_file_llseek, }; static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); const struct proc_ns_operations **entry, **last; unsigned int len = dentry->d_name.len; struct dentry *res = ERR_PTR(-ENOENT); if (!task) goto out_no_task; last = &ns_entries[ARRAY_SIZE(ns_entries)]; for (entry = ns_entries; entry < last; entry++) { if (strlen((*entry)->name) != len) continue; if (!memcmp(dentry->d_name.name, (*entry)->name, len)) break; } if (entry == last) goto out; res = proc_ns_instantiate(dentry, task, *entry); out: put_task_struct(task); out_no_task: return res; } const struct inode_operations proc_ns_dir_inode_operations = { .lookup = proc_ns_dir_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; |
1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 | // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC packet reception * * Copyright (C) 2007, 2016, 2022 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include "ar-internal.h" static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb); /* * handle data received on the local endpoint * - may be called in interrupt context * * [!] Note that as this is called from the encap_rcv hook, the socket is not * held locked by the caller and nothing prevents sk_user_data on the UDP from * being cleared in the middle of processing this function. * * Called with the RCU read lock held from the IP layer via UDP. */ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) { struct sk_buff_head *rx_queue; struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); struct task_struct *io_thread; if (unlikely(!local)) { kfree_skb(skb); return 0; } io_thread = READ_ONCE(local->io_thread); if (!io_thread) { kfree_skb(skb); return 0; } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); skb->mark = RXRPC_SKB_MARK_PACKET; rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); rx_queue = &local->rx_queue; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY if (rxrpc_inject_rx_delay || !skb_queue_empty(&local->rx_delay_queue)) { skb->tstamp = ktime_add_ms(skb->tstamp, rxrpc_inject_rx_delay); rx_queue = &local->rx_delay_queue; } #endif skb_queue_tail(rx_queue, skb); wake_up_process(io_thread); return 0; } /* * Handle an error received on the local endpoint. */ void rxrpc_error_report(struct sock *sk) { struct rxrpc_local *local; struct sk_buff *skb; rcu_read_lock(); local = rcu_dereference_sk_user_data(sk); if (unlikely(!local)) { rcu_read_unlock(); return; } while ((skb = skb_dequeue(&sk->sk_error_queue))) { skb->mark = RXRPC_SKB_MARK_ERROR; rxrpc_new_skb(skb, rxrpc_skb_new_error_report); skb_queue_tail(&local->rx_queue, skb); } rxrpc_wake_up_io_thread(local); rcu_read_unlock(); } /* * Directly produce an abort from a packet. */ bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, abort_code, err); skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; skb->priority = abort_code; return false; } static bool rxrpc_bad_message(struct sk_buff *skb, enum rxrpc_abort_reason why) { return rxrpc_direct_abort(skb, why, RX_PROTOCOL_ERROR, -EBADMSG); } #define just_discard true /* * Process event packets targeted at a local endpoint. */ static bool rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); char v; _enter(""); rxrpc_see_skb(skb, rxrpc_skb_see_version); if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &v, 1) >= 0) { if (v == 0) rxrpc_send_version_request(local, &sp->hdr, skb); } return true; } /* * Extract the wire header from a packet and translate the byte order. */ static bool rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) { struct rxrpc_wire_header whdr; struct rxrpc_ackpacket ack; /* dig out the RxRPC connection details */ if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) return rxrpc_bad_message(skb, rxrpc_badmsg_short_hdr); memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); sp->hdr.cid = ntohl(whdr.cid); sp->hdr.callNumber = ntohl(whdr.callNumber); sp->hdr.seq = ntohl(whdr.seq); sp->hdr.serial = ntohl(whdr.serial); sp->hdr.flags = whdr.flags; sp->hdr.type = whdr.type; sp->hdr.userStatus = whdr.userStatus; sp->hdr.securityIndex = whdr.securityIndex; sp->hdr._rsvd = ntohs(whdr._rsvd); sp->hdr.serviceId = ntohs(whdr.serviceId); if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) { if (skb_copy_bits(skb, sizeof(whdr), &ack, sizeof(ack)) < 0) return rxrpc_bad_message(skb, rxrpc_badmsg_short_ack); sp->ack.first_ack = ntohl(ack.firstPacket); sp->ack.prev_ack = ntohl(ack.previousPacket); sp->ack.acked_serial = ntohl(ack.serial); sp->ack.reason = ack.reason; sp->ack.nr_acks = ack.nAcks; } return true; } /* * Extract the abort code from an ABORT packet and stash it in skb->priority. */ static bool rxrpc_extract_abort(struct sk_buff *skb) { __be32 wtmp; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &wtmp, sizeof(wtmp)) < 0) return false; skb->priority = ntohl(wtmp); return true; } /* * Process packets received on the local endpoint */ static bool rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) { struct rxrpc_connection *conn; struct sockaddr_rxrpc peer_srx; struct rxrpc_skb_priv *sp; struct rxrpc_peer *peer = NULL; struct sk_buff *skb = *_skb; bool ret = false; skb_pull(skb, sizeof(struct udphdr)); sp = rxrpc_skb(skb); /* dig out the RxRPC connection details */ if (!rxrpc_extract_header(sp, skb)) return just_discard; if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); return just_discard; } } trace_rxrpc_rx_packet(sp); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) return just_discard; return rxrpc_input_version(local, skb); case RXRPC_PACKET_TYPE_BUSY: if (rxrpc_to_server(sp)) return just_discard; fallthrough; case RXRPC_PACKET_TYPE_ACK: case RXRPC_PACKET_TYPE_ACKALL: if (sp->hdr.callNumber == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); break; case RXRPC_PACKET_TYPE_ABORT: if (!rxrpc_extract_abort(skb)) return just_discard; /* Just discard if malformed */ break; case RXRPC_PACKET_TYPE_DATA: if (sp->hdr.callNumber == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_call); if (sp->hdr.seq == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_seq); /* Unshare the packet so that it can be modified for in-place * decryption. */ if (sp->hdr.securityIndex != 0) { skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) { rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare_nomem); *_skb = NULL; return just_discard; } if (skb != *_skb) { rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare); *_skb = skb; rxrpc_new_skb(skb, rxrpc_skb_new_unshared); sp = rxrpc_skb(skb); } } break; case RXRPC_PACKET_TYPE_CHALLENGE: if (rxrpc_to_server(sp)) return just_discard; break; case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_to_client(sp)) return just_discard; break; /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: return just_discard; default: return rxrpc_bad_message(skb, rxrpc_badmsg_unsupported_packet); } if (sp->hdr.serviceId == 0) return rxrpc_bad_message(skb, rxrpc_badmsg_zero_service); if (WARN_ON_ONCE(rxrpc_extract_addr_from_skb(&peer_srx, skb) < 0)) return just_discard; /* Unsupported address type. */ if (peer_srx.transport.family != local->srx.transport.family && (peer_srx.transport.family == AF_INET && local->srx.transport.family != AF_INET6)) { pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", peer_srx.transport.family, local->srx.transport.family); return just_discard; /* Wrong address type. */ } if (rxrpc_to_client(sp)) { rcu_read_lock(); conn = rxrpc_find_client_connection_rcu(local, &peer_srx, skb); conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); rcu_read_unlock(); if (!conn) return rxrpc_protocol_error(skb, rxrpc_eproto_no_client_conn); ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb); rxrpc_put_connection(conn, rxrpc_conn_put_call_input); return ret; } /* We need to look up service connections by the full protocol * parameter set. We look up the peer first as an intermediate step * and then the connection from the peer's tree. */ rcu_read_lock(); peer = rxrpc_lookup_peer_rcu(local, &peer_srx); if (!peer) { rcu_read_unlock(); return rxrpc_new_incoming_call(local, NULL, NULL, &peer_srx, skb); } conn = rxrpc_find_service_conn_rcu(peer, skb); conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); if (conn) { rcu_read_unlock(); ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb); rxrpc_put_connection(conn, rxrpc_conn_put_call_input); return ret; } peer = rxrpc_get_peer_maybe(peer, rxrpc_peer_get_input); rcu_read_unlock(); ret = rxrpc_new_incoming_call(local, peer, NULL, &peer_srx, skb); rxrpc_put_peer(peer, rxrpc_peer_put_input); return ret; } /* * Deal with a packet that's associated with an extant connection. */ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; bool ret; if (sp->hdr.securityIndex != conn->security_ix) return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, RXKADINCONSISTENCY, -EBADMSG); if (sp->hdr.serviceId != conn->service_id) { int old_id; if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) return rxrpc_protocol_error(skb, rxrpc_eproto_reupgrade); old_id = cmpxchg(&conn->service_id, conn->orig_service_id, sp->hdr.serviceId); if (old_id != conn->orig_service_id && old_id != sp->hdr.serviceId) return rxrpc_protocol_error(skb, rxrpc_eproto_bad_upgrade); } if (after(sp->hdr.serial, conn->hi_serial)) conn->hi_serial = sp->hdr.serial; /* It's a connection-level packet if the call number is 0. */ if (sp->hdr.callNumber == 0) return rxrpc_input_conn_packet(conn, skb); /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) return just_discard; if (sp->hdr.callNumber == chan->last_call) { if (chan->call || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) return just_discard; /* For the previous service call, if completed successfully, we * discard all further packets. */ if (rxrpc_conn_is_service(conn) && chan->last_type == RXRPC_PACKET_TYPE_ACK) return just_discard; /* But otherwise we need to retransmit the final packet from * data cached in the connection record. */ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) trace_rxrpc_rx_data(chan->call_debug_id, sp->hdr.seq, sp->hdr.serial, sp->hdr.flags); rxrpc_conn_retransmit_call(conn, skb, channel); return just_discard; } call = rxrpc_try_get_call(chan->call, rxrpc_call_get_input); if (sp->hdr.callNumber > chan->call_id) { if (rxrpc_to_client(sp)) { rxrpc_put_call(call, rxrpc_call_put_input); return rxrpc_protocol_error(skb, rxrpc_eproto_unexpected_implicit_end); } if (call) { rxrpc_implicit_end_call(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); call = NULL; } } if (!call) { if (rxrpc_to_client(sp)) return rxrpc_protocol_error(skb, rxrpc_eproto_no_client_call); return rxrpc_new_incoming_call(conn->local, conn->peer, conn, peer_srx, skb); } ret = rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return ret; } /* * I/O and event handling thread. */ int rxrpc_io_thread(void *data) { struct rxrpc_connection *conn; struct sk_buff_head rx_queue; struct rxrpc_local *local = data; struct rxrpc_call *call; struct sk_buff *skb; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY ktime_t now; #endif bool should_stop; complete(&local->io_thread_ready); skb_queue_head_init(&rx_queue); set_user_nice(current, MIN_NICE); for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); /* Deal with connections that want immediate attention. */ conn = list_first_entry_or_null(&local->conn_attend_q, struct rxrpc_connection, attend_link); if (conn) { spin_lock_bh(&local->lock); list_del_init(&conn->attend_link); spin_unlock_bh(&local->lock); rxrpc_input_conn_event(conn, NULL); rxrpc_put_connection(conn, rxrpc_conn_put_poke); continue; } if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) rxrpc_discard_expired_client_conns(local); /* Deal with calls that want immediate attention. */ if ((call = list_first_entry_or_null(&local->call_attend_q, struct rxrpc_call, attend_link))) { spin_lock_bh(&local->lock); list_del_init(&call->attend_link); spin_unlock_bh(&local->lock); trace_rxrpc_call_poked(call); rxrpc_input_call_event(call, NULL); rxrpc_put_call(call, rxrpc_call_put_poke); continue; } if (!list_empty(&local->new_client_calls)) rxrpc_connect_client_calls(local); /* Process received packets and errors. */ if ((skb = __skb_dequeue(&rx_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: skb->priority = 0; if (!rxrpc_input_packet(local, &skb)) rxrpc_reject_packet(local, skb); trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_free_skb(skb, rxrpc_skb_put_input); break; case RXRPC_SKB_MARK_ERROR: rxrpc_input_error(local, skb); rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: rxrpc_input_conn_event(sp->conn, skb); rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: WARN_ON_ONCE(1); rxrpc_free_skb(skb, rxrpc_skb_put_unknown); break; } continue; } /* Inject a delay into packets if requested. */ #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY now = ktime_get_real(); while ((skb = skb_peek(&local->rx_delay_queue))) { if (ktime_before(now, skb->tstamp)) break; skb = skb_dequeue(&local->rx_delay_queue); skb_queue_tail(&local->rx_queue, skb); } #endif if (!skb_queue_empty(&local->rx_queue)) { spin_lock_irq(&local->rx_queue.lock); skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); spin_unlock_irq(&local->rx_queue.lock); continue; } set_current_state(TASK_INTERRUPTIBLE); should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || !list_empty(&local->call_attend_q) || !list_empty(&local->conn_attend_q) || !list_empty(&local->new_client_calls) || test_bit(RXRPC_CLIENT_CONN_REAP_TIMER, &local->client_conn_flags)) { __set_current_state(TASK_RUNNING); continue; } if (should_stop) break; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY skb = skb_peek(&local->rx_delay_queue); if (skb) { unsigned long timeout; ktime_t tstamp = skb->tstamp; ktime_t now = ktime_get_real(); s64 delay_ns = ktime_to_ns(ktime_sub(tstamp, now)); if (delay_ns <= 0) { __set_current_state(TASK_RUNNING); continue; } timeout = nsecs_to_jiffies(delay_ns); timeout = max(timeout, 1UL); schedule_timeout(timeout); __set_current_state(TASK_RUNNING); continue; } #endif schedule(); } __set_current_state(TASK_RUNNING); rxrpc_see_local(local, rxrpc_local_stop); rxrpc_destroy_local(local); WRITE_ONCE(local->io_thread, NULL); rxrpc_see_local(local, rxrpc_local_stopped); return 0; } |
3 12 12 12 12 482 481 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 | // SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ? ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. */ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. * @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init); |
1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/skbuff.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" static char *resp_state_name[] = { [RESPST_NONE] = "NONE", [RESPST_GET_REQ] = "GET_REQ", [RESPST_CHK_PSN] = "CHK_PSN", [RESPST_CHK_OP_SEQ] = "CHK_OP_SEQ", [RESPST_CHK_OP_VALID] = "CHK_OP_VALID", [RESPST_CHK_RESOURCE] = "CHK_RESOURCE", [RESPST_CHK_LENGTH] = "CHK_LENGTH", [RESPST_CHK_RKEY] = "CHK_RKEY", [RESPST_EXECUTE] = "EXECUTE", [RESPST_READ_REPLY] = "READ_REPLY", [RESPST_ATOMIC_REPLY] = "ATOMIC_REPLY", [RESPST_ATOMIC_WRITE_REPLY] = "ATOMIC_WRITE_REPLY", [RESPST_PROCESS_FLUSH] = "PROCESS_FLUSH", [RESPST_COMPLETE] = "COMPLETE", [RESPST_ACKNOWLEDGE] = "ACKNOWLEDGE", [RESPST_CLEANUP] = "CLEANUP", [RESPST_DUPLICATE_REQUEST] = "DUPLICATE_REQUEST", [RESPST_ERR_MALFORMED_WQE] = "ERR_MALFORMED_WQE", [RESPST_ERR_UNSUPPORTED_OPCODE] = "ERR_UNSUPPORTED_OPCODE", [RESPST_ERR_MISALIGNED_ATOMIC] = "ERR_MISALIGNED_ATOMIC", [RESPST_ERR_PSN_OUT_OF_SEQ] = "ERR_PSN_OUT_OF_SEQ", [RESPST_ERR_MISSING_OPCODE_FIRST] = "ERR_MISSING_OPCODE_FIRST", [RESPST_ERR_MISSING_OPCODE_LAST_C] = "ERR_MISSING_OPCODE_LAST_C", [RESPST_ERR_MISSING_OPCODE_LAST_D1E] = "ERR_MISSING_OPCODE_LAST_D1E", [RESPST_ERR_TOO_MANY_RDMA_ATM_REQ] = "ERR_TOO_MANY_RDMA_ATM_REQ", [RESPST_ERR_RNR] = "ERR_RNR", [RESPST_ERR_RKEY_VIOLATION] = "ERR_RKEY_VIOLATION", [RESPST_ERR_INVALIDATE_RKEY] = "ERR_INVALIDATE_RKEY_VIOLATION", [RESPST_ERR_LENGTH] = "ERR_LENGTH", [RESPST_ERR_CQ_OVERFLOW] = "ERR_CQ_OVERFLOW", [RESPST_ERROR] = "ERROR", [RESPST_DONE] = "DONE", [RESPST_EXIT] = "EXIT", }; /* rxe_recv calls here to add a request packet to the input queue */ void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) { skb_queue_tail(&qp->req_pkts, skb); rxe_sched_task(&qp->recv_task); } static inline enum resp_states get_req(struct rxe_qp *qp, struct rxe_pkt_info **pkt_p) { struct sk_buff *skb; skb = skb_peek(&qp->req_pkts); if (!skb) return RESPST_EXIT; *pkt_p = SKB_TO_PKT(skb); return (qp->resp.res) ? RESPST_READ_REPLY : RESPST_CHK_PSN; } static enum resp_states check_psn(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { int diff = psn_compare(pkt->psn, qp->resp.psn); struct rxe_dev *rxe = to_rdev(qp->ibqp.device); switch (qp_type(qp)) { case IB_QPT_RC: if (diff > 0) { if (qp->resp.sent_psn_nak) return RESPST_CLEANUP; qp->resp.sent_psn_nak = 1; rxe_counter_inc(rxe, RXE_CNT_OUT_OF_SEQ_REQ); return RESPST_ERR_PSN_OUT_OF_SEQ; } else if (diff < 0) { rxe_counter_inc(rxe, RXE_CNT_DUP_REQ); return RESPST_DUPLICATE_REQUEST; } if (qp->resp.sent_psn_nak) qp->resp.sent_psn_nak = 0; break; case IB_QPT_UC: if (qp->resp.drop_msg || diff != 0) { if (pkt->mask & RXE_START_MASK) { qp->resp.drop_msg = 0; return RESPST_CHK_OP_SEQ; } qp->resp.drop_msg = 1; return RESPST_CLEANUP; } break; default: break; } return RESPST_CHK_OP_SEQ; } static enum resp_states check_op_seq(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { switch (qp_type(qp)) { case IB_QPT_RC: switch (qp->resp.opcode) { case IB_OPCODE_RC_SEND_FIRST: case IB_OPCODE_RC_SEND_MIDDLE: switch (pkt->opcode) { case IB_OPCODE_RC_SEND_MIDDLE: case IB_OPCODE_RC_SEND_LAST: case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: return RESPST_CHK_OP_VALID; default: return RESPST_ERR_MISSING_OPCODE_LAST_C; } case IB_OPCODE_RC_RDMA_WRITE_FIRST: case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: switch (pkt->opcode) { case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: case IB_OPCODE_RC_RDMA_WRITE_LAST: case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: return RESPST_CHK_OP_VALID; default: return RESPST_ERR_MISSING_OPCODE_LAST_C; } default: switch (pkt->opcode) { case IB_OPCODE_RC_SEND_MIDDLE: case IB_OPCODE_RC_SEND_LAST: case IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE: case IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE: case IB_OPCODE_RC_RDMA_WRITE_MIDDLE: case IB_OPCODE_RC_RDMA_WRITE_LAST: case IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE: return RESPST_ERR_MISSING_OPCODE_FIRST; default: return RESPST_CHK_OP_VALID; } } break; case IB_QPT_UC: switch (qp->resp.opcode) { case IB_OPCODE_UC_SEND_FIRST: case IB_OPCODE_UC_SEND_MIDDLE: switch (pkt->opcode) { case IB_OPCODE_UC_SEND_MIDDLE: case IB_OPCODE_UC_SEND_LAST: case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: return RESPST_CHK_OP_VALID; default: return RESPST_ERR_MISSING_OPCODE_LAST_D1E; } case IB_OPCODE_UC_RDMA_WRITE_FIRST: case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: switch (pkt->opcode) { case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: case IB_OPCODE_UC_RDMA_WRITE_LAST: case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: return RESPST_CHK_OP_VALID; default: return RESPST_ERR_MISSING_OPCODE_LAST_D1E; } default: switch (pkt->opcode) { case IB_OPCODE_UC_SEND_MIDDLE: case IB_OPCODE_UC_SEND_LAST: case IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE: case IB_OPCODE_UC_RDMA_WRITE_MIDDLE: case IB_OPCODE_UC_RDMA_WRITE_LAST: case IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE: qp->resp.drop_msg = 1; return RESPST_CLEANUP; default: return RESPST_CHK_OP_VALID; } } break; default: return RESPST_CHK_OP_VALID; } } static bool check_qp_attr_access(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { if (((pkt->mask & RXE_READ_MASK) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_READ)) || ((pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) || ((pkt->mask & RXE_ATOMIC_MASK) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) return false; if (pkt->mask & RXE_FLUSH_MASK) { u32 flush_type = feth_plt(pkt); if ((flush_type & IB_FLUSH_GLOBAL && !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_GLOBAL)) || (flush_type & IB_FLUSH_PERSISTENT && !(qp->attr.qp_access_flags & IB_ACCESS_FLUSH_PERSISTENT))) return false; } return true; } static enum resp_states check_op_valid(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { switch (qp_type(qp)) { case IB_QPT_RC: if (!check_qp_attr_access(qp, pkt)) return RESPST_ERR_UNSUPPORTED_OPCODE; break; case IB_QPT_UC: if ((pkt->mask & RXE_WRITE_MASK) && !(qp->attr.qp_access_flags & IB_ACCESS_REMOTE_WRITE)) { qp->resp.drop_msg = 1; return RESPST_CLEANUP; } break; case IB_QPT_UD: case IB_QPT_GSI: break; default: WARN_ON_ONCE(1); break; } return RESPST_CHK_RESOURCE; } static enum resp_states get_srq_wqe(struct rxe_qp *qp) { struct rxe_srq *srq = qp->srq; struct rxe_queue *q = srq->rq.queue; struct rxe_recv_wqe *wqe; struct ib_event ev; unsigned int count; size_t size; unsigned long flags; if (srq->error) return RESPST_ERR_RNR; spin_lock_irqsave(&srq->rq.consumer_lock, flags); wqe = queue_head(q, QUEUE_TYPE_FROM_CLIENT); if (!wqe) { spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); return RESPST_ERR_RNR; } /* don't trust user space data */ if (unlikely(wqe->dma.num_sge > srq->rq.max_sge)) { spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); rxe_dbg_qp(qp, "invalid num_sge in SRQ entry\n"); return RESPST_ERR_MALFORMED_WQE; } size = sizeof(*wqe) + wqe->dma.num_sge*sizeof(struct rxe_sge); memcpy(&qp->resp.srq_wqe, wqe, size); qp->resp.wqe = &qp->resp.srq_wqe.wqe; queue_advance_consumer(q, QUEUE_TYPE_FROM_CLIENT); count = queue_count(q, QUEUE_TYPE_FROM_CLIENT); if (srq->limit && srq->ibsrq.event_handler && (count < srq->limit)) { srq->limit = 0; goto event; } spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); return RESPST_CHK_LENGTH; event: spin_unlock_irqrestore(&srq->rq.consumer_lock, flags); ev.device = qp->ibqp.device; ev.element.srq = qp->ibqp.srq; ev.event = IB_EVENT_SRQ_LIMIT_REACHED; srq->ibsrq.event_handler(&ev, srq->ibsrq.srq_context); return RESPST_CHK_LENGTH; } static enum resp_states check_resource(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { struct rxe_srq *srq = qp->srq; if (pkt->mask & (RXE_READ_OR_ATOMIC_MASK | RXE_ATOMIC_WRITE_MASK)) { /* it is the requesters job to not send * too many read/atomic ops, we just * recycle the responder resource queue */ if (likely(qp->attr.max_dest_rd_atomic > 0)) return RESPST_CHK_LENGTH; else return RESPST_ERR_TOO_MANY_RDMA_ATM_REQ; } if (pkt->mask & RXE_RWR_MASK) { if (srq) return get_srq_wqe(qp); qp->resp.wqe = queue_head(qp->rq.queue, QUEUE_TYPE_FROM_CLIENT); return (qp->resp.wqe) ? RESPST_CHK_LENGTH : RESPST_ERR_RNR; } return RESPST_CHK_LENGTH; } static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { /* * See IBA C9-92 * For UD QPs we only check if the packet will fit in the * receive buffer later. For RDMA operations additional * length checks are performed in check_rkey. */ if ((qp_type(qp) == IB_QPT_GSI) || (qp_type(qp) == IB_QPT_UD)) { unsigned int payload = payload_size(pkt); unsigned int recv_buffer_len = 0; int i; for (i = 0; i < qp->resp.wqe->dma.num_sge; i++) recv_buffer_len += qp->resp.wqe->dma.sge[i].length; if (payload + sizeof(union rdma_network_hdr) > recv_buffer_len) { rxe_dbg_qp(qp, "The receive buffer is too small for this UD packet.\n"); return RESPST_ERR_LENGTH; } } if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))) { unsigned int mtu = qp->mtu; unsigned int payload = payload_size(pkt); if ((pkt->mask & RXE_START_MASK) && (pkt->mask & RXE_END_MASK)) { if (unlikely(payload > mtu)) { rxe_dbg_qp(qp, "only packet too long\n"); return RESPST_ERR_LENGTH; } } else if ((pkt->mask & RXE_START_MASK) || (pkt->mask & RXE_MIDDLE_MASK)) { if (unlikely(payload != mtu)) { rxe_dbg_qp(qp, "first or middle packet not mtu\n"); return RESPST_ERR_LENGTH; } } else if (pkt->mask & RXE_END_MASK) { if (unlikely((payload == 0) || (payload > mtu))) { rxe_dbg_qp(qp, "last packet zero or too long\n"); return RESPST_ERR_LENGTH; } } } /* See IBA C9-94 */ if (pkt->mask & RXE_RETH_MASK) { if (reth_len(pkt) > (1U << 31)) { rxe_dbg_qp(qp, "dma length too long\n"); return RESPST_ERR_LENGTH; } } if (pkt->mask & RXE_RDMA_OP_MASK) return RESPST_CHK_RKEY; else return RESPST_EXECUTE; } /* if the reth length field is zero we can assume nothing * about the rkey value and should not validate or use it. * Instead set qp->resp.rkey to 0 which is an invalid rkey * value since the minimum index part is 1. */ static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { unsigned int length = reth_len(pkt); qp->resp.va = reth_va(pkt); qp->resp.offset = 0; qp->resp.resid = length; qp->resp.length = length; if (pkt->mask & RXE_READ_OR_WRITE_MASK && length == 0) qp->resp.rkey = 0; else qp->resp.rkey = reth_rkey(pkt); } static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { qp->resp.va = atmeth_va(pkt); qp->resp.offset = 0; qp->resp.rkey = atmeth_rkey(pkt); qp->resp.resid = sizeof(u64); } /* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL * if an invalid rkey is received or the rdma length is zero. For middle * or last packets use the stored value of mr. */ static enum resp_states check_rkey(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { struct rxe_mr *mr = NULL; struct rxe_mw *mw = NULL; u64 va; u32 rkey; u32 resid; u32 pktlen; int mtu = qp->mtu; enum resp_states state; int access = 0; /* parse RETH or ATMETH header for first/only packets * for va, length, rkey, etc. or use current value for * middle/last packets. */ if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (pkt->mask & RXE_RETH_MASK) qp_resp_from_reth(qp, pkt); access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ : IB_ACCESS_REMOTE_WRITE; } else if (pkt->mask & RXE_FLUSH_MASK) { u32 flush_type = feth_plt(pkt); if (pkt->mask & RXE_RETH_MASK) qp_resp_from_reth(qp, pkt); if (flush_type & IB_FLUSH_GLOBAL) access |= IB_ACCESS_FLUSH_GLOBAL; if (flush_type & IB_FLUSH_PERSISTENT) access |= IB_ACCESS_FLUSH_PERSISTENT; } else if (pkt->mask & RXE_ATOMIC_MASK) { qp_resp_from_atmeth(qp, pkt); access = IB_ACCESS_REMOTE_ATOMIC; } else { /* shouldn't happen */ WARN_ON(1); } /* A zero-byte read or write op is not required to * set an addr or rkey. See C9-88 */ if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { qp->resp.mr = NULL; return RESPST_EXECUTE; } va = qp->resp.va; rkey = qp->resp.rkey; resid = qp->resp.resid; pktlen = payload_size(pkt); if (rkey_is_mw(rkey)) { mw = rxe_lookup_mw(qp, access, rkey); if (!mw) { rxe_dbg_qp(qp, "no MW matches rkey %#x\n", rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } mr = mw->mr; if (!mr) { rxe_dbg_qp(qp, "MW doesn't have an MR\n"); state = RESPST_ERR_RKEY_VIOLATION; goto err; } if (mw->access & IB_ZERO_BASED) qp->resp.offset = mw->addr; rxe_get(mr); rxe_put(mw); mw = NULL; } else { mr = lookup_mr(qp->pd, access, rkey, RXE_LOOKUP_REMOTE); if (!mr) { rxe_dbg_qp(qp, "no MR matches rkey %#x\n", rkey); state = RESPST_ERR_RKEY_VIOLATION; goto err; } } if (pkt->mask & RXE_FLUSH_MASK) { /* FLUSH MR may not set va or resid * no need to check range since we will flush whole mr */ if (feth_sel(pkt) == IB_FLUSH_MR) goto skip_check_range; } if (mr_check_range(mr, va + qp->resp.offset, resid)) { state = RESPST_ERR_RKEY_VIOLATION; goto err; } skip_check_range: if (pkt->mask & (RXE_WRITE_MASK | RXE_ATOMIC_WRITE_MASK)) { if (resid > mtu) { if (pktlen != mtu || bth_pad(pkt)) { state = RESPST_ERR_LENGTH; goto err; } } else { if (pktlen != resid) { state = RESPST_ERR_LENGTH; goto err; } if ((bth_pad(pkt) != (0x3 & (-resid)))) { /* This case may not be exactly that * but nothing else fits. */ state = RESPST_ERR_LENGTH; goto err; } } } WARN_ON_ONCE(qp->resp.mr); qp->resp.mr = mr; return RESPST_EXECUTE; err: qp->resp.mr = NULL; if (mr) rxe_put(mr); if (mw) rxe_put(mw); return state; } static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr, int data_len) { int err; err = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma, data_addr, data_len, RXE_TO_MR_OBJ); if (unlikely(err)) return (err == -ENOSPC) ? RESPST_ERR_LENGTH : RESPST_ERR_MALFORMED_WQE; return RESPST_NONE; } static enum resp_states write_data_in(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { enum resp_states rc = RESPST_NONE; int err; int data_len = payload_size(pkt); err = rxe_mr_copy(qp->resp.mr, qp->resp.va + qp->resp.offset, payload_addr(pkt), data_len, RXE_TO_MR_OBJ); if (err) { rc = RESPST_ERR_RKEY_VIOLATION; goto out; } qp->resp.va += data_len; qp->resp.resid -= data_len; out: return rc; } static struct resp_res *rxe_prepare_res(struct rxe_qp *qp, struct rxe_pkt_info *pkt, int type) { struct resp_res *res; u32 pkts; res = &qp->resp.resources[qp->resp.res_head]; rxe_advance_resp_resource(qp); free_rd_atomic_resource(res); res->type = type; res->replay = 0; switch (type) { case RXE_READ_MASK: res->read.va = qp->resp.va + qp->resp.offset; res->read.va_org = qp->resp.va + qp->resp.offset; res->read.resid = qp->resp.resid; res->read.length = qp->resp.resid; res->read.rkey = qp->resp.rkey; pkts = max_t(u32, (reth_len(pkt) + qp->mtu - 1)/qp->mtu, 1); res->first_psn = pkt->psn; res->cur_psn = pkt->psn; res->last_psn = (pkt->psn + pkts - 1) & BTH_PSN_MASK; res->state = rdatm_res_state_new; break; case RXE_ATOMIC_MASK: case RXE_ATOMIC_WRITE_MASK: res->first_psn = pkt->psn; res->last_psn = pkt->psn; res->cur_psn = pkt->psn; break; case RXE_FLUSH_MASK: res->flush.va = qp->resp.va + qp->resp.offset; res->flush.length = qp->resp.length; res->flush.type = feth_plt(pkt); res->flush.level = feth_sel(pkt); } return res; } static enum resp_states process_flush(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { u64 length, start; struct rxe_mr *mr = qp->resp.mr; struct resp_res *res = qp->resp.res; /* oA19-14, oA19-15 */ if (res && res->replay) return RESPST_ACKNOWLEDGE; else if (!res) { res = rxe_prepare_res(qp, pkt, RXE_FLUSH_MASK); qp->resp.res = res; } if (res->flush.level == IB_FLUSH_RANGE) { start = res->flush.va; length = res->flush.length; } else { /* level == IB_FLUSH_MR */ start = mr->ibmr.iova; length = mr->ibmr.length; } if (res->flush.type & IB_FLUSH_PERSISTENT) { if (rxe_flush_pmem_iova(mr, start, length)) return RESPST_ERR_RKEY_VIOLATION; /* Make data persistent. */ wmb(); } else if (res->flush.type & IB_FLUSH_GLOBAL) { /* Make data global visibility. */ wmb(); } qp->resp.msn++; /* next expected psn, read handles this separately */ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; qp->resp.ack_psn = qp->resp.psn; qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; return RESPST_ACKNOWLEDGE; } static enum resp_states atomic_reply(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { struct rxe_mr *mr = qp->resp.mr; struct resp_res *res = qp->resp.res; int err; if (!res) { res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK); qp->resp.res = res; } if (!res->replay) { u64 iova = qp->resp.va + qp->resp.offset; err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, atmeth_comp(pkt), atmeth_swap_add(pkt), &res->atomic.orig_val); if (err) return err; qp->resp.msn++; /* next expected psn, read handles this separately */ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; qp->resp.ack_psn = qp->resp.psn; qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; } return RESPST_ACKNOWLEDGE; } static enum resp_states atomic_write_reply(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { struct resp_res *res = qp->resp.res; struct rxe_mr *mr; u64 value; u64 iova; int err; if (!res) { res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK); qp->resp.res = res; } if (res->replay) return RESPST_ACKNOWLEDGE; mr = qp->resp.mr; value = *(u64 *)payload_addr(pkt); iova = qp->resp.va + qp->resp.offset; err = rxe_mr_do_atomic_write(mr, iova, value); if (err) return err; qp->resp.resid = 0; qp->resp.msn++; /* next expected psn, read handles this separately */ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; qp->resp.ack_psn = qp->resp.psn; qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; return RESPST_ACKNOWLEDGE; } static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, struct rxe_pkt_info *ack, int opcode, int payload, u32 psn, u8 syndrome) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct sk_buff *skb; int paylen; int pad; int err; /* * allocate packet */ pad = (-payload) & 0x3; paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; skb = rxe_init_packet(rxe, &qp->pri_av, paylen, ack); if (!skb) return NULL; ack->qp = qp; ack->opcode = opcode; ack->mask = rxe_opcode[opcode].mask; ack->paylen = paylen; ack->psn = psn; bth_init(ack, opcode, 0, 0, pad, IB_DEFAULT_PKEY_FULL, qp->attr.dest_qp_num, 0, psn); if (ack->mask & RXE_AETH_MASK) { aeth_set_syn(ack, syndrome); aeth_set_msn(ack, qp->resp.msn); } if (ack->mask & RXE_ATMACK_MASK) atmack_set_orig(ack, qp->resp.res->atomic.orig_val); err = rxe_prepare(&qp->pri_av, ack, skb); if (err) { kfree_skb(skb); return NULL; } return skb; } /** * rxe_recheck_mr - revalidate MR from rkey and get a reference * @qp: the qp * @rkey: the rkey * * This code allows the MR to be invalidated or deregistered or * the MW if one was used to be invalidated or deallocated. * It is assumed that the access permissions if originally good * are OK and the mappings to be unchanged. * * TODO: If someone reregisters an MR to change its size or * access permissions during the processing of an RDMA read * we should kill the responder resource and complete the * operation with an error. * * Return: mr on success else NULL */ static struct rxe_mr *rxe_recheck_mr(struct rxe_qp *qp, u32 rkey) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); struct rxe_mr *mr; struct rxe_mw *mw; if (rkey_is_mw(rkey)) { mw = rxe_pool_get_index(&rxe->mw_pool, rkey >> 8); if (!mw) return NULL; mr = mw->mr; if (mw->rkey != rkey || mw->state != RXE_MW_STATE_VALID || !mr || mr->state != RXE_MR_STATE_VALID) { rxe_put(mw); return NULL; } rxe_get(mr); rxe_put(mw); return mr; } mr = rxe_pool_get_index(&rxe->mr_pool, rkey >> 8); if (!mr) return NULL; if (mr->rkey != rkey || mr->state != RXE_MR_STATE_VALID) { rxe_put(mr); return NULL; } return mr; } /* RDMA read response. If res is not NULL, then we have a current RDMA request * being processed or replayed. */ static enum resp_states read_reply(struct rxe_qp *qp, struct rxe_pkt_info *req_pkt) { struct rxe_pkt_info ack_pkt; struct sk_buff *skb; int mtu = qp->mtu; enum resp_states state; int payload; int opcode; int err; struct resp_res *res = qp->resp.res; struct rxe_mr *mr; if (!res) { res = rxe_prepare_res(qp, req_pkt, RXE_READ_MASK); qp->resp.res = res; } if (res->state == rdatm_res_state_new) { if (!res->replay || qp->resp.length == 0) { /* if length == 0 mr will be NULL (is ok) * otherwise qp->resp.mr holds a ref on mr * which we transfer to mr and drop below. */ mr = qp->resp.mr; qp->resp.mr = NULL; } else { mr = rxe_recheck_mr(qp, res->read.rkey); if (!mr) return RESPST_ERR_RKEY_VIOLATION; } if (res->read.resid <= mtu) opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY; else opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; } else { /* re-lookup mr from rkey on all later packets. * length will be non-zero. This can fail if someone * modifies or destroys the mr since the first packet. */ mr = rxe_recheck_mr(qp, res->read.rkey); if (!mr) return RESPST_ERR_RKEY_VIOLATION; if (res->read.resid > mtu) opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE; else opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST; } res->state = rdatm_res_state_next; payload = min_t(int, res->read.resid, mtu); skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, res->cur_psn, AETH_ACK_UNLIMITED); if (!skb) { state = RESPST_ERR_RNR; goto err_out; } err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); if (err) { kfree_skb(skb); state = RESPST_ERR_RKEY_VIOLATION; goto err_out; } if (bth_pad(&ack_pkt)) { u8 *pad = payload_addr(&ack_pkt) + payload; memset(pad, 0, bth_pad(&ack_pkt)); } /* rxe_xmit_packet always consumes the skb */ err = rxe_xmit_packet(qp, &ack_pkt, skb); if (err) { state = RESPST_ERR_RNR; goto err_out; } res->read.va += payload; res->read.resid -= payload; res->cur_psn = (res->cur_psn + 1) & BTH_PSN_MASK; if (res->read.resid > 0) { state = RESPST_DONE; } else { qp->resp.res = NULL; if (!res->replay) qp->resp.opcode = -1; if (psn_compare(res->cur_psn, qp->resp.psn) >= 0) qp->resp.psn = res->cur_psn; state = RESPST_CLEANUP; } err_out: if (mr) rxe_put(mr); return state; } static int invalidate_rkey(struct rxe_qp *qp, u32 rkey) { if (rkey_is_mw(rkey)) return rxe_invalidate_mw(qp, rkey); else return rxe_invalidate_mr(qp, rkey); } /* Executes a new request. A retried request never reach that function (send * and writes are discarded, and reads and atomics are retried elsewhere. */ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { enum resp_states err; struct sk_buff *skb = PKT_TO_SKB(pkt); union rdma_network_hdr hdr; if (pkt->mask & RXE_SEND_MASK) { if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) { if (skb->protocol == htons(ETH_P_IP)) { memset(&hdr.reserved, 0, sizeof(hdr.reserved)); memcpy(&hdr.roce4grh, ip_hdr(skb), sizeof(hdr.roce4grh)); err = send_data_in(qp, &hdr, sizeof(hdr)); } else { err = send_data_in(qp, ipv6_hdr(skb), sizeof(hdr)); } if (err) return err; } err = send_data_in(qp, payload_addr(pkt), payload_size(pkt)); if (err) return err; } else if (pkt->mask & RXE_WRITE_MASK) { err = write_data_in(qp, pkt); if (err) return err; } else if (pkt->mask & RXE_READ_MASK) { /* For RDMA Read we can increment the msn now. See C9-148. */ qp->resp.msn++; return RESPST_READ_REPLY; } else if (pkt->mask & RXE_ATOMIC_MASK) { return RESPST_ATOMIC_REPLY; } else if (pkt->mask & RXE_ATOMIC_WRITE_MASK) { return RESPST_ATOMIC_WRITE_REPLY; } else if (pkt->mask & RXE_FLUSH_MASK) { return RESPST_PROCESS_FLUSH; } else { /* Unreachable */ WARN_ON_ONCE(1); } if (pkt->mask & RXE_IETH_MASK) { u32 rkey = ieth_rkey(pkt); err = invalidate_rkey(qp, rkey); if (err) return RESPST_ERR_INVALIDATE_RKEY; } if (pkt->mask & RXE_END_MASK) /* We successfully processed this new request. */ qp->resp.msn++; /* next expected psn, read handles this separately */ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; qp->resp.ack_psn = qp->resp.psn; qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; if (pkt->mask & RXE_COMP_MASK) return RESPST_COMPLETE; else if (qp_type(qp) == IB_QPT_RC) return RESPST_ACKNOWLEDGE; else return RESPST_CLEANUP; } static enum resp_states do_complete(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { struct rxe_cqe cqe; struct ib_wc *wc = &cqe.ibwc; struct ib_uverbs_wc *uwc = &cqe.uibwc; struct rxe_recv_wqe *wqe = qp->resp.wqe; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); unsigned long flags; if (!wqe) goto finish; memset(&cqe, 0, sizeof(cqe)); if (qp->rcq->is_user) { uwc->status = qp->resp.status; uwc->qp_num = qp->ibqp.qp_num; uwc->wr_id = wqe->wr_id; } else { wc->status = qp->resp.status; wc->qp = &qp->ibqp; wc->wr_id = wqe->wr_id; } if (wc->status == IB_WC_SUCCESS) { rxe_counter_inc(rxe, RXE_CNT_RDMA_RECV); wc->opcode = (pkt->mask & RXE_IMMDT_MASK && pkt->mask & RXE_WRITE_MASK) ? IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; wc->byte_len = (pkt->mask & RXE_IMMDT_MASK && pkt->mask & RXE_WRITE_MASK) ? qp->resp.length : wqe->dma.length - wqe->dma.resid; /* fields after byte_len are different between kernel and user * space */ if (qp->rcq->is_user) { uwc->wc_flags = IB_WC_GRH; if (pkt->mask & RXE_IMMDT_MASK) { uwc->wc_flags |= IB_WC_WITH_IMM; uwc->ex.imm_data = immdt_imm(pkt); } if (pkt->mask & RXE_IETH_MASK) { uwc->wc_flags |= IB_WC_WITH_INVALIDATE; uwc->ex.invalidate_rkey = ieth_rkey(pkt); } if (pkt->mask & RXE_DETH_MASK) uwc->src_qp = deth_sqp(pkt); uwc->port_num = qp->attr.port_num; } else { struct sk_buff *skb = PKT_TO_SKB(pkt); wc->wc_flags = IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE; if (skb->protocol == htons(ETH_P_IP)) wc->network_hdr_type = RDMA_NETWORK_IPV4; else wc->network_hdr_type = RDMA_NETWORK_IPV6; if (is_vlan_dev(skb->dev)) { wc->wc_flags |= IB_WC_WITH_VLAN; wc->vlan_id = vlan_dev_vlan_id(skb->dev); } if (pkt->mask & RXE_IMMDT_MASK) { wc->wc_flags |= IB_WC_WITH_IMM; wc->ex.imm_data = immdt_imm(pkt); } if (pkt->mask & RXE_IETH_MASK) { wc->wc_flags |= IB_WC_WITH_INVALIDATE; wc->ex.invalidate_rkey = ieth_rkey(pkt); } if (pkt->mask & RXE_DETH_MASK) wc->src_qp = deth_sqp(pkt); wc->port_num = qp->attr.port_num; } } else { if (wc->status != IB_WC_WR_FLUSH_ERR) rxe_err_qp(qp, "non-flush error status = %d\n", wc->status); } /* have copy for srq and reference for !srq */ if (!qp->srq) queue_advance_consumer(qp->rq.queue, QUEUE_TYPE_FROM_CLIENT); qp->resp.wqe = NULL; if (rxe_cq_post(qp->rcq, &cqe, pkt ? bth_se(pkt) : 1)) return RESPST_ERR_CQ_OVERFLOW; finish: spin_lock_irqsave(&qp->state_lock, flags); if (unlikely(qp_state(qp) == IB_QPS_ERR)) { spin_unlock_irqrestore(&qp->state_lock, flags); return RESPST_CHK_RESOURCE; } spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(!pkt)) return RESPST_DONE; if (qp_type(qp) == IB_QPT_RC) return RESPST_ACKNOWLEDGE; else return RESPST_CLEANUP; } static int send_common_ack(struct rxe_qp *qp, u8 syndrome, u32 psn, int opcode, const char *msg) { int err; struct rxe_pkt_info ack_pkt; struct sk_buff *skb; skb = prepare_ack_packet(qp, &ack_pkt, opcode, 0, psn, syndrome); if (!skb) return -ENOMEM; err = rxe_xmit_packet(qp, &ack_pkt, skb); if (err) rxe_dbg_qp(qp, "Failed sending %s\n", msg); return err; } static int send_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) { return send_common_ack(qp, syndrome, psn, IB_OPCODE_RC_ACKNOWLEDGE, "ACK"); } static int send_atomic_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) { int ret = send_common_ack(qp, syndrome, psn, IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE, "ATOMIC ACK"); /* have to clear this since it is used to trigger * long read replies */ qp->resp.res = NULL; return ret; } static int send_read_response_ack(struct rxe_qp *qp, u8 syndrome, u32 psn) { int ret = send_common_ack(qp, syndrome, psn, IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY, "RDMA READ response of length zero ACK"); /* have to clear this since it is used to trigger * long read replies */ qp->resp.res = NULL; return ret; } static enum resp_states acknowledge(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { if (qp_type(qp) != IB_QPT_RC) return RESPST_CLEANUP; if (qp->resp.aeth_syndrome != AETH_ACK_UNLIMITED) send_ack(qp, qp->resp.aeth_syndrome, pkt->psn); else if (pkt->mask & RXE_ATOMIC_MASK) send_atomic_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); else if (pkt->mask & (RXE_FLUSH_MASK | RXE_ATOMIC_WRITE_MASK)) send_read_response_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); else if (bth_ack(pkt)) send_ack(qp, AETH_ACK_UNLIMITED, pkt->psn); return RESPST_CLEANUP; } static enum resp_states cleanup(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { struct sk_buff *skb; if (pkt) { skb = skb_dequeue(&qp->req_pkts); rxe_put(qp); kfree_skb(skb); ib_device_put(qp->ibqp.device); } if (qp->resp.mr) { rxe_put(qp->resp.mr); qp->resp.mr = NULL; } return RESPST_DONE; } static struct resp_res *find_resource(struct rxe_qp *qp, u32 psn) { int i; for (i = 0; i < qp->attr.max_dest_rd_atomic; i++) { struct resp_res *res = &qp->resp.resources[i]; if (res->type == 0) continue; if (psn_compare(psn, res->first_psn) >= 0 && psn_compare(psn, res->last_psn) <= 0) { return res; } } return NULL; } static enum resp_states duplicate_request(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { enum resp_states rc; u32 prev_psn = (qp->resp.ack_psn - 1) & BTH_PSN_MASK; if (pkt->mask & RXE_SEND_MASK || pkt->mask & RXE_WRITE_MASK) { /* SEND. Ack again and cleanup. C9-105. */ send_ack(qp, AETH_ACK_UNLIMITED, prev_psn); return RESPST_CLEANUP; } else if (pkt->mask & RXE_FLUSH_MASK) { struct resp_res *res; /* Find the operation in our list of responder resources. */ res = find_resource(qp, pkt->psn); if (res) { res->replay = 1; res->cur_psn = pkt->psn; qp->resp.res = res; rc = RESPST_PROCESS_FLUSH; goto out; } /* Resource not found. Class D error. Drop the request. */ rc = RESPST_CLEANUP; goto out; } else if (pkt->mask & RXE_READ_MASK) { struct resp_res *res; res = find_resource(qp, pkt->psn); if (!res) { /* Resource not found. Class D error. Drop the * request. */ rc = RESPST_CLEANUP; goto out; } else { /* Ensure this new request is the same as the previous * one or a subset of it. */ u64 iova = reth_va(pkt); u32 resid = reth_len(pkt); if (iova < res->read.va_org || resid > res->read.length || (iova + resid) > (res->read.va_org + res->read.length)) { rc = RESPST_CLEANUP; goto out; } if (reth_rkey(pkt) != res->read.rkey) { rc = RESPST_CLEANUP; goto out; } res->cur_psn = pkt->psn; res->state = (pkt->psn == res->first_psn) ? rdatm_res_state_new : rdatm_res_state_replay; res->replay = 1; /* Reset the resource, except length. */ res->read.va_org = iova; res->read.va = iova; res->read.resid = resid; /* Replay the RDMA read reply. */ qp->resp.res = res; rc = RESPST_READ_REPLY; goto out; } } else { struct resp_res *res; /* Find the operation in our list of responder resources. */ res = find_resource(qp, pkt->psn); if (res) { res->replay = 1; res->cur_psn = pkt->psn; qp->resp.res = res; rc = pkt->mask & RXE_ATOMIC_MASK ? RESPST_ATOMIC_REPLY : RESPST_ATOMIC_WRITE_REPLY; goto out; } /* Resource not found. Class D error. Drop the request. */ rc = RESPST_CLEANUP; goto out; } out: return rc; } /* Process a class A or C. Both are treated the same in this implementation. */ static void do_class_ac_error(struct rxe_qp *qp, u8 syndrome, enum ib_wc_status status) { qp->resp.aeth_syndrome = syndrome; qp->resp.status = status; /* indicate that we should go through the ERROR state */ qp->resp.goto_error = 1; } static enum resp_states do_class_d1e_error(struct rxe_qp *qp) { /* UC */ if (qp->srq) { /* Class E */ qp->resp.drop_msg = 1; if (qp->resp.wqe) { qp->resp.status = IB_WC_REM_INV_REQ_ERR; return RESPST_COMPLETE; } else { return RESPST_CLEANUP; } } else { /* Class D1. This packet may be the start of a * new message and could be valid. The previous * message is invalid and ignored. reset the * recv wr to its original state */ if (qp->resp.wqe) { qp->resp.wqe->dma.resid = qp->resp.wqe->dma.length; qp->resp.wqe->dma.cur_sge = 0; qp->resp.wqe->dma.sge_offset = 0; qp->resp.opcode = -1; } if (qp->resp.mr) { rxe_put(qp->resp.mr); qp->resp.mr = NULL; } return RESPST_CLEANUP; } } /* drain incoming request packet queue */ static void drain_req_pkts(struct rxe_qp *qp) { struct sk_buff *skb; while ((skb = skb_dequeue(&qp->req_pkts))) { rxe_put(qp); kfree_skb(skb); ib_device_put(qp->ibqp.device); } } /* complete receive wqe with flush error */ static int flush_recv_wqe(struct rxe_qp *qp, struct rxe_recv_wqe *wqe) { struct rxe_cqe cqe = {}; struct ib_wc *wc = &cqe.ibwc; struct ib_uverbs_wc *uwc = &cqe.uibwc; int err; if (qp->rcq->is_user) { uwc->wr_id = wqe->wr_id; uwc->status = IB_WC_WR_FLUSH_ERR; uwc->qp_num = qp_num(qp); } else { wc->wr_id = wqe->wr_id; wc->status = IB_WC_WR_FLUSH_ERR; wc->qp = &qp->ibqp; } err = rxe_cq_post(qp->rcq, &cqe, 0); if (err) rxe_dbg_cq(qp->rcq, "post cq failed err = %d\n", err); return err; } /* drain and optionally complete the recive queue * if unable to complete a wqe stop completing and * just flush the remaining wqes */ static void flush_recv_queue(struct rxe_qp *qp, bool notify) { struct rxe_queue *q = qp->rq.queue; struct rxe_recv_wqe *wqe; int err; if (qp->srq) { if (notify && qp->ibqp.event_handler) { struct ib_event ev; ev.device = qp->ibqp.device; ev.element.qp = &qp->ibqp; ev.event = IB_EVENT_QP_LAST_WQE_REACHED; qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); } return; } /* recv queue not created. nothing to do. */ if (!qp->rq.queue) return; while ((wqe = queue_head(q, q->type))) { if (notify) { err = flush_recv_wqe(qp, wqe); if (err) notify = 0; } queue_advance_consumer(q, q->type); } qp->resp.wqe = NULL; } int rxe_receiver(struct rxe_qp *qp) { struct rxe_dev *rxe = to_rdev(qp->ibqp.device); enum resp_states state; struct rxe_pkt_info *pkt = NULL; int ret; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { bool notify = qp->valid && (qp_state(qp) == IB_QPS_ERR); drain_req_pkts(qp); flush_recv_queue(qp, notify); spin_unlock_irqrestore(&qp->state_lock, flags); goto exit; } spin_unlock_irqrestore(&qp->state_lock, flags); qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; state = RESPST_GET_REQ; while (1) { rxe_dbg_qp(qp, "state = %s\n", resp_state_name[state]); switch (state) { case RESPST_GET_REQ: state = get_req(qp, &pkt); break; case RESPST_CHK_PSN: state = check_psn(qp, pkt); break; case RESPST_CHK_OP_SEQ: state = check_op_seq(qp, pkt); break; case RESPST_CHK_OP_VALID: state = check_op_valid(qp, pkt); break; case RESPST_CHK_RESOURCE: state = check_resource(qp, pkt); break; case RESPST_CHK_LENGTH: state = rxe_resp_check_length(qp, pkt); break; case RESPST_CHK_RKEY: state = check_rkey(qp, pkt); break; case RESPST_EXECUTE: state = execute(qp, pkt); break; case RESPST_COMPLETE: state = do_complete(qp, pkt); break; case RESPST_READ_REPLY: state = read_reply(qp, pkt); break; case RESPST_ATOMIC_REPLY: state = atomic_reply(qp, pkt); break; case RESPST_ATOMIC_WRITE_REPLY: state = atomic_write_reply(qp, pkt); break; case RESPST_PROCESS_FLUSH: state = process_flush(qp, pkt); break; case RESPST_ACKNOWLEDGE: state = acknowledge(qp, pkt); break; case RESPST_CLEANUP: state = cleanup(qp, pkt); break; case RESPST_DUPLICATE_REQUEST: state = duplicate_request(qp, pkt); break; case RESPST_ERR_PSN_OUT_OF_SEQ: /* RC only - Class B. Drop packet. */ send_ack(qp, AETH_NAK_PSN_SEQ_ERROR, qp->resp.psn); state = RESPST_CLEANUP; break; case RESPST_ERR_TOO_MANY_RDMA_ATM_REQ: case RESPST_ERR_MISSING_OPCODE_FIRST: case RESPST_ERR_MISSING_OPCODE_LAST_C: case RESPST_ERR_UNSUPPORTED_OPCODE: case RESPST_ERR_MISALIGNED_ATOMIC: /* RC Only - Class C. */ do_class_ac_error(qp, AETH_NAK_INVALID_REQ, IB_WC_REM_INV_REQ_ERR); state = RESPST_COMPLETE; break; case RESPST_ERR_MISSING_OPCODE_LAST_D1E: state = do_class_d1e_error(qp); break; case RESPST_ERR_RNR: if (qp_type(qp) == IB_QPT_RC) { rxe_counter_inc(rxe, RXE_CNT_SND_RNR); /* RC - class B */ send_ack(qp, AETH_RNR_NAK | (~AETH_TYPE_MASK & qp->attr.min_rnr_timer), pkt->psn); } else { /* UD/UC - class D */ qp->resp.drop_msg = 1; } state = RESPST_CLEANUP; break; case RESPST_ERR_RKEY_VIOLATION: if (qp_type(qp) == IB_QPT_RC) { /* Class C */ do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR, IB_WC_REM_ACCESS_ERR); state = RESPST_COMPLETE; } else { qp->resp.drop_msg = 1; if (qp->srq) { /* UC/SRQ Class D */ qp->resp.status = IB_WC_REM_ACCESS_ERR; state = RESPST_COMPLETE; } else { /* UC/non-SRQ Class E. */ state = RESPST_CLEANUP; } } break; case RESPST_ERR_INVALIDATE_RKEY: /* RC - Class J. */ qp->resp.goto_error = 1; qp->resp.status = IB_WC_REM_INV_REQ_ERR; state = RESPST_COMPLETE; break; case RESPST_ERR_LENGTH: if (qp_type(qp) == IB_QPT_RC) { /* Class C */ do_class_ac_error(qp, AETH_NAK_INVALID_REQ, IB_WC_REM_INV_REQ_ERR); state = RESPST_COMPLETE; } else if (qp->srq) { /* UC/UD - class E */ qp->resp.status = IB_WC_REM_INV_REQ_ERR; state = RESPST_COMPLETE; } else { /* UC/UD - class D */ qp->resp.drop_msg = 1; state = RESPST_CLEANUP; } break; case RESPST_ERR_MALFORMED_WQE: /* All, Class A. */ do_class_ac_error(qp, AETH_NAK_REM_OP_ERR, IB_WC_LOC_QP_OP_ERR); state = RESPST_COMPLETE; break; case RESPST_ERR_CQ_OVERFLOW: /* All - Class G */ state = RESPST_ERROR; break; case RESPST_DONE: if (qp->resp.goto_error) { state = RESPST_ERROR; break; } goto done; case RESPST_EXIT: if (qp->resp.goto_error) { state = RESPST_ERROR; break; } goto exit; case RESPST_ERROR: qp->resp.goto_error = 0; rxe_dbg_qp(qp, "moved to error state\n"); rxe_qp_error(qp); goto exit; default: WARN_ON_ONCE(1); } } /* A non-zero return value will cause rxe_do_task to * exit its loop and end the work item. A zero return * will continue looping and return to rxe_responder */ done: ret = 0; goto out; exit: ret = -EAGAIN; out: return ret; } |
1 11 18 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _RDS_RDS_H #define _RDS_RDS_H #include <net/sock.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <rdma/rdma_cm.h> #include <linux/mutex.h> #include <linux/rds.h> #include <linux/rhashtable.h> #include <linux/refcount.h> #include <linux/in6.h> #include "info.h" /* * RDS Network protocol version */ #define RDS_PROTOCOL_3_0 0x0300 #define RDS_PROTOCOL_3_1 0x0301 #define RDS_PROTOCOL_4_0 0x0400 #define RDS_PROTOCOL_4_1 0x0401 #define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 #define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) #define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1 /* The following ports, 16385, 18634, 18635, are registered with IANA as * the ports to be used for RDS over TCP and UDP. Currently, only RDS over * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept * to ensure compatibility with older RDS modules. Those ports are defined * in each transport's header file. */ #define RDS_PORT 18634 #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif #ifdef RDS_DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else /* sigh, pr_debug() causes unused variable warnings */ static inline __printf(1, 2) void rdsdebug(char *fmt, ...) { } #endif #define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) /* Used to limit both RDMA and non-RDMA RDS message to 1MB */ #define RDS_MAX_MSG_SIZE ((unsigned int)(1 << 20)) #define RDS_CONG_MAP_BYTES (65536 / 8) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) struct rds_cong_map { struct rb_node m_rb_node; struct in6_addr m_addr; wait_queue_head_t m_waitq; struct list_head m_conn_list; unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; }; /* * This is how we will track the connection state: * A connection is always in one of the following * states. Updates to the state are atomic and imply * a memory barrier. */ enum { RDS_CONN_DOWN = 0, RDS_CONN_CONNECTING, RDS_CONN_DISCONNECTING, RDS_CONN_UP, RDS_CONN_RESETTING, RDS_CONN_ERROR, }; /* Bits for c_flags */ #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 #define RDS_RECV_REFILL 3 #define RDS_DESTROY_PENDING 4 /* Max number of multipaths per RDS connection. Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 #define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ (rs)->rs_hash_initval) & ((n) - 1)) #define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) /* Per mpath connection state */ struct rds_conn_path { struct rds_connection *cp_conn; struct rds_message *cp_xmit_rm; unsigned long cp_xmit_sg; unsigned int cp_xmit_hdr_off; unsigned int cp_xmit_data_off; unsigned int cp_xmit_atomic_sent; unsigned int cp_xmit_rdma_sent; unsigned int cp_xmit_data_sent; spinlock_t cp_lock; /* protect msg queues */ u64 cp_next_tx_seq; struct list_head cp_send_queue; struct list_head cp_retrans; u64 cp_next_rx_seq; void *cp_transport_data; atomic_t cp_state; unsigned long cp_send_gen; unsigned long cp_flags; unsigned long cp_reconnect_jiffies; struct delayed_work cp_send_w; struct delayed_work cp_recv_w; struct delayed_work cp_conn_w; struct work_struct cp_down_w; struct mutex cp_cm_lock; /* protect cp_state & cm */ wait_queue_head_t cp_waitq; unsigned int cp_unacked_packets; unsigned int cp_unacked_bytes; unsigned int cp_index; }; /* One rds_connection per RDS address pair */ struct rds_connection { struct hlist_node c_hash_node; struct in6_addr c_laddr; struct in6_addr c_faddr; int c_dev_if; /* ifindex used for this conn */ int c_bound_if; /* ifindex of c_laddr */ unsigned int c_loopback:1, c_isv6:1, c_ping_triggered:1, c_pad_to_32:29; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; /* Protocol version */ unsigned int c_proposed_version; unsigned int c_version; possible_net_t c_net; /* TOS */ u8 c_tos; struct list_head c_map_item; unsigned long c_map_queued; struct rds_conn_path *c_path; wait_queue_head_t c_hs_waitq; /* handshake waitq */ u32 c_my_gen_num; u32 c_peer_gen_num; }; static inline struct net *rds_conn_net(struct rds_connection *conn) { return read_pnet(&conn->c_net); } static inline void rds_conn_net_set(struct rds_connection *conn, struct net *net) { write_pnet(&conn->c_net, net); } #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 #define RDS_MAX_ADV_CREDIT 255 /* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping * probe to exchange control information before establishing a connection. * Currently the control information that is exchanged is the number of * supported paths. If the peer is a legacy (older kernel revision) peer, * it would return a pong message without additional control information * that would then alert the sender that the peer was an older rev. */ #define RDS_FLAG_PROBE_PORT 1 #define RDS_HS_PROBE(sport, dport) \ ((sport == RDS_FLAG_PROBE_PORT && dport == 0) || \ (sport == 0 && dport == RDS_FLAG_PROBE_PORT)) /* * Maximum space available for extension headers. */ #define RDS_HEADER_EXT_SPACE 16 struct rds_header { __be64 h_sequence; __be64 h_ack; __be32 h_len; __be16 h_sport; __be16 h_dport; u8 h_flags; u8 h_credit; u8 h_padding[4]; __sum16 h_csum; u8 h_exthdr[RDS_HEADER_EXT_SPACE]; }; /* * Reserved - indicates end of extensions */ #define RDS_EXTHDR_NONE 0 /* * This extension header is included in the very * first message that is sent on a new connection, * and identifies the protocol level. This will help * rolling updates if a future change requires breaking * the protocol. * NB: This is no longer true for IB, where we do a version * negotiation during the connection setup phase (protocol * version information is included in the RDMA CM private data). */ #define RDS_EXTHDR_VERSION 1 struct rds_ext_header_version { __be32 h_version; }; /* * This extension header is included in the RDS message * chasing an RDMA operation. */ #define RDS_EXTHDR_RDMA 2 struct rds_ext_header_rdma { __be32 h_rdma_rkey; }; /* * This extension header tells the peer about the * destination <R_Key,offset> of the requested RDMA * operation. */ #define RDS_EXTHDR_RDMA_DEST 3 struct rds_ext_header_rdma_dest { __be32 h_rdma_rkey; __be32 h_rdma_offset; }; /* Extension header announcing number of paths. * Implicit length = 2 bytes. */ #define RDS_EXTHDR_NPATHS 5 #define RDS_EXTHDR_GEN_NUM 6 #define __RDS_EXTHDR_MAX 16 /* for now */ #define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) #define RDS_MSG_RX_HDR 0 #define RDS_MSG_RX_START 1 #define RDS_MSG_RX_END 2 #define RDS_MSG_RX_CMSG 3 /* The following values are whitelisted for usercopy */ struct rds_inc_usercopy { rds_rdma_cookie_t rdma_cookie; ktime_t rx_tstamp; }; struct rds_incoming { refcount_t i_refcount; struct list_head i_item; struct rds_connection *i_conn; struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; struct in6_addr i_saddr; struct rds_inc_usercopy i_usercopy; u64 i_rx_lat_trace[RDS_RX_MAX_TRACES]; }; struct rds_mr { struct rb_node r_rb_node; struct kref r_kref; u32 r_key; /* A copy of the creation flags */ unsigned int r_use_once:1; unsigned int r_invalidate:1; unsigned int r_write:1; struct rds_sock *r_sock; /* back pointer to the socket that owns us */ struct rds_transport *r_trans; void *r_trans_private; }; static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) { return r_key | (((u64) offset) << 32); } static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) { return cookie; } static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) { return cookie >> 32; } /* atomic operation types */ #define RDS_ATOMIC_TYPE_CSWP 0 #define RDS_ATOMIC_TYPE_FADD 1 /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty * the message will not be put back on the retransmit list after being sent. * messages that are canceled while being sent rely on this. * * m_inc is used by loopback so that it can pass an incoming message straight * back up into the rx path. It embeds a wire header which is also used by * the send path, which is kind of awkward. * * m_sock_item indicates the message's presence on a socket's send or receive * queue. m_rs will point to that socket. * * m_daddr is used by cancellation to prune messages to a given destination. * * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock * nesting. As paths iterate over messages on a sock, or conn, they must * also lock the conn, or sock, to remove the message from those lists too. * Testing the flag to determine if the message is still on the lists lets * us avoid testing the list_head directly. That means each path can use * the message's list_head to keep it on a local list while juggling locks * without confusing the other path. * * m_ack_seq is an optional field set by transports who need a different * sequence number range to invalidate. They can use this in a callback * that they pass to rds_send_drop_acked() to see if each message has been * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't * had ack_seq set yet. */ #define RDS_MSG_ON_SOCK 1 #define RDS_MSG_ON_CONN 2 #define RDS_MSG_HAS_ACK_SEQ 3 #define RDS_MSG_ACK_REQUIRED 4 #define RDS_MSG_RETRANSMITTED 5 #define RDS_MSG_MAPPED 6 #define RDS_MSG_PAGEVEC 7 #define RDS_MSG_FLUSH 8 struct rds_znotifier { struct mmpin z_mmp; u32 z_cookie; }; struct rds_msg_zcopy_info { struct list_head rs_zcookie_next; union { struct rds_znotifier znotif; struct rds_zcopy_cookies zcookies; }; }; struct rds_msg_zcopy_queue { struct list_head zcookie_head; spinlock_t lock; /* protects zcookie_head queue */ }; static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q) { spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->zcookie_head); } struct rds_iov_vector { struct rds_iovec *iov; int len; }; struct rds_iov_vector_arr { struct rds_iov_vector *vec; int len; int indx; int incr; }; struct rds_message { refcount_t m_refcount; struct list_head m_sock_item; struct list_head m_conn_item; struct rds_incoming m_inc; u64 m_ack_seq; struct in6_addr m_daddr; unsigned long m_flags; /* Never access m_rs without holding m_rs_lock. * Lock nesting is * rm->m_rs_lock * -> rs->rs_lock */ spinlock_t m_rs_lock; wait_queue_head_t m_flush_wait; struct rds_sock *m_rs; /* cookie to send to remote, in rds header */ rds_rdma_cookie_t m_rdma_cookie; unsigned int m_used_sgs; unsigned int m_total_sgs; void *m_final_op; struct { struct rm_atomic_op { int op_type; union { struct { uint64_t compare; uint64_t swap; uint64_t compare_mask; uint64_t swap_mask; } op_m_cswp; struct { uint64_t add; uint64_t nocarry_mask; } op_m_fadd; }; u32 op_rkey; u64 op_remote_addr; unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; struct scatterlist *op_sg; struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; } atomic; struct rm_rdma_op { u32 op_rkey; u64 op_remote_addr; unsigned int op_write:1; unsigned int op_fence:1; unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; unsigned int op_bytes; unsigned int op_nents; unsigned int op_count; struct scatterlist *op_sg; struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; u64 op_odp_addr; struct rds_mr *op_odp_mr; } rdma; struct rm_data_op { unsigned int op_active:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; unsigned int op_dmaoff; struct rds_znotifier *op_mmp_znotifier; struct scatterlist *op_sg; } data; }; struct rds_conn_path *m_conn_path; }; /* * The RDS notifier is used (optionally) to tell the application about * completed RDMA operations. Rather than keeping the whole rds message * around on the queue, we allocate a small notifier that is put on the * socket's notifier_list. Notifications are delivered to the application * through control messages. */ struct rds_notifier { struct list_head n_list; uint64_t n_user_token; int n_status; }; /* Available as part of RDS core, so doesn't need to participate * in get_preferred transport etc */ #define RDS_TRANS_LOOP 3 /** * struct rds_transport - transport specific behavioural hooks * * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send * part of a message. The caller serializes on the send_sem so this * doesn't need to be reentrant for a given conn. The header must be * sent before the data payload. .xmit must be prepared to send a * message with no data payload. .xmit should return the number of * bytes that were sent down the connection, including header bytes. * Returning 0 tells the caller that it doesn't need to perform any * additional work now. This is usually the case when the transport has * filled the sending queue for its connection and will handle * triggering the rds thread to continue the send when space becomes * available. Returning -EAGAIN tells the caller to retry the send * immediately. Returning -ENOMEM tells the caller to retry the send at * some point in the future. * * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once * it returns the connection can not call rds_recv_incoming(). * This will only be called once after conn_connect returns * non-zero success and will The caller serializes this with * the send and connecting paths (xmit_* and conn_*). The * transport is responsible for other serialization, including * rds_recv_incoming(). This is called in process context but * should try hard not to block. */ struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; struct module *t_owner; unsigned int t_prefer_loopback:1, t_mp_capable:1; unsigned int t_type; int (*laddr_check)(struct net *net, const struct in6_addr *addr, __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_path_connect)(struct rds_conn_path *cp); void (*conn_path_shutdown)(struct rds_conn_path *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_path_complete)(struct rds_conn_path *cp); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv_path)(struct rds_conn_path *cp); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to); void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, struct rdma_cm_event *event, bool isv6); int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6); void (*cm_connect_complete)(struct rds_connection *conn, struct rdma_cm_event *event); unsigned int (*stats_info_copy)(struct rds_info_iterator *iter, unsigned int avail); void (*exit)(void); void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg, struct rds_sock *rs, u32 *key_ret, struct rds_connection *conn, u64 start, u64 length, int need_odp); void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); bool (*t_unloading)(struct rds_connection *conn); u8 (*get_tos_map)(u8 tos); }; /* Bind hash table key length. It is the sum of the size of a struct * in6_addr, a scope_id and a port. */ #define RDS_BOUND_KEY_LEN \ (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16)) struct rds_sock { struct sock rs_sk; u64 rs_user_addr; u64 rs_user_bytes; /* * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. */ struct rhash_head rs_bound_node; u8 rs_bound_key[RDS_BOUND_KEY_LEN]; struct sockaddr_in6 rs_bound_sin6; #define rs_bound_addr rs_bound_sin6.sin6_addr #define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] #define rs_bound_port rs_bound_sin6.sin6_port #define rs_bound_scope_id rs_bound_sin6.sin6_scope_id struct in6_addr rs_conn_addr; #define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3] __be16 rs_conn_port; struct rds_transport *rs_transport; /* * rds_sendmsg caches the conn it used the last time around. * This helps avoid costly lookups. */ struct rds_connection *rs_conn; /* flag indicating we were congested or not */ int rs_congested; /* seen congestion (ENOBUFS) when sending? */ int rs_seen_congestion; /* rs_lock protects all these adjacent members before the newline */ spinlock_t rs_lock; struct list_head rs_send_queue; u32 rs_snd_bytes; int rs_rcv_bytes; struct list_head rs_notify_queue; /* currently used for failed RDMAs */ /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask * to decide whether the application should be woken up. * If not set, we use rs_cong_track to find out whether a cong map * update arrived. */ uint64_t rs_cong_mask; uint64_t rs_cong_notify; struct list_head rs_cong_list; unsigned long rs_cong_track; /* * rs_recv_lock protects the receive queue, and is * used to serialize with rds_release. */ rwlock_t rs_recv_lock; struct list_head rs_recv_queue; /* just for stats reporting */ struct list_head rs_item; /* these have their own lock */ spinlock_t rs_rdma_lock; struct rb_root rs_rdma_keys; /* Socket options - in case there will be more */ unsigned char rs_recverr, rs_cong_monitor; u32 rs_hash_initval; /* Socket receive path trace points*/ u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; struct rds_msg_zcopy_queue rs_zcookie_queue; u8 rs_tos; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) { return container_of(sk, struct rds_sock, rs_sk); } static inline struct sock *rds_rs_to_sk(struct rds_sock *rs) { return &rs->rs_sk; } /* * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value * to account for overhead. We don't account for overhead, we just apply * the number of payload bytes to the specified value. */ static inline int rds_sk_sndbuf(struct rds_sock *rs) { return rds_rs_to_sk(rs)->sk_sndbuf / 2; } static inline int rds_sk_rcvbuf(struct rds_sock *rs) { return rds_rs_to_sk(rs)->sk_rcvbuf / 2; } struct rds_statistics { uint64_t s_conn_reset; uint64_t s_recv_drop_bad_checksum; uint64_t s_recv_drop_old_seq; uint64_t s_recv_drop_no_sock; uint64_t s_recv_drop_dead_sock; uint64_t s_recv_deliver_raced; uint64_t s_recv_delivered; uint64_t s_recv_queued; uint64_t s_recv_immediate_retry; uint64_t s_recv_delayed_retry; uint64_t s_recv_ack_required; uint64_t s_recv_rdma_bytes; uint64_t s_recv_ping; uint64_t s_send_queue_empty; uint64_t s_send_queue_full; uint64_t s_send_lock_contention; uint64_t s_send_lock_queue_raced; uint64_t s_send_immediate_retry; uint64_t s_send_delayed_retry; uint64_t s_send_drop_acked; uint64_t s_send_ack_required; uint64_t s_send_queued; uint64_t s_send_rdma; uint64_t s_send_rdma_bytes; uint64_t s_send_pong; uint64_t s_page_remainder_hit; uint64_t s_page_remainder_miss; uint64_t s_copy_to_user; uint64_t s_copy_from_user; uint64_t s_cong_update_queued; uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; uint64_t s_recv_bytes_added_to_socket; uint64_t s_recv_bytes_removed_from_socket; uint64_t s_send_stuck_rm; }; /* af_rds.c */ void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); static inline void __rds_wake_sk_sleep(struct sock *sk) { wait_queue_head_t *waitq = sk_sleep(sk); if (!sock_flag(sk, SOCK_DEAD) && waitq) wake_up(waitq); } extern wait_queue_head_t rds_poll_waitq; /* bind.c */ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id); int rds_bind_lock_init(void); void rds_bind_lock_destroy(void); /* cong.c */ int rds_cong_get_maps(struct rds_connection *conn); void rds_cong_add_conn(struct rds_connection *conn); void rds_cong_remove_conn(struct rds_connection *conn); void rds_cong_set_bit(struct rds_cong_map *map, __be16 port); void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port); int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs); void rds_cong_queue_updates(struct rds_cong_map *map); void rds_cong_map_updated(struct rds_cong_map *map, uint64_t); int rds_cong_updated_since(unsigned long *recent); void rds_cong_add_socket(struct rds_sock *); void rds_cong_remove_socket(struct rds_sock *); void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* connection.c */ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, u8 tos, gfp_t gfp, int dev_if); void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_conn_path_connect_if_down(struct rds_conn_path *cp); void rds_check_all_paths(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, int (*visitor)(struct rds_connection *, void *), u64 *buffer, size_t item_len); __printf(2, 3) void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...); #define rds_conn_path_error(cp, fmt...) \ __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt) static inline int rds_conn_path_transition(struct rds_conn_path *cp, int old, int new) { return atomic_cmpxchg(&cp->cp_state, old, new) == old; } static inline int rds_conn_transition(struct rds_connection *conn, int old, int new) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_transition(&conn->c_path[0], old, new); } static inline int rds_conn_path_state(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state); } static inline int rds_conn_state(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_state(&conn->c_path[0]); } static inline int rds_conn_path_up(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_UP; } static inline int rds_conn_path_down(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_DOWN; } static inline int rds_conn_up(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_up(&conn->c_path[0]); } static inline int rds_conn_path_connecting(struct rds_conn_path *cp) { return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING; } static inline int rds_conn_connecting(struct rds_connection *conn) { WARN_ON(conn->c_trans->t_mp_capable); return rds_conn_path_connecting(&conn->c_path[0]); } /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from, bool zcopy); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); void rds_message_wait(struct rds_message *rm); void rds_message_unmapped(struct rds_message *rm); void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info); static inline void rds_message_make_checksum(struct rds_header *hdr) { hdr->h_csum = 0; hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2); } static inline int rds_message_verify_checksum(const struct rds_header *hdr) { return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0; } /* page.c */ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp); void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, struct in6_addr *saddr); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, struct in6_addr *saddr); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp); int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); void rds_clear_recv_queue(struct rds_sock *rs); int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip); void rds6_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, struct in6_addr *saddr, struct in6_addr *daddr, int flip); /* send.c */ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_path_reset(struct rds_conn_path *conn); int rds_send_xmit(struct rds_conn_path *cp); struct sockaddr_in; void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack, is_acked_func is_acked); void rds_send_ping(struct rds_connection *conn, int cp_index); int rds_send_pong(struct rds_conn_path *cp, __be16 dport); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen); int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen); int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen); void rds_rdma_drop_keys(struct rds_sock *rs); int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov); int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec); int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void rds_rdma_free_op(struct rm_rdma_op *ro); void rds_atomic_free_op(struct rm_atomic_op *ao); void rds_rdma_send_complete(struct rds_message *rm, int wc_status); void rds_atomic_send_complete(struct rds_message *rm, int wc_status); int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void __rds_put_mr_final(struct kref *kref); static inline bool rds_destroy_pending(struct rds_connection *conn) { return !check_net(rds_conn_net(conn)) || (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn)); } enum { ODP_NOT_NEEDED, ODP_ZEROBASED, ODP_VIRTUAL }; /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); #define rds_stats_inc_which(which, member) do { \ per_cpu(which, get_cpu()).member++; \ put_cpu(); \ } while (0) #define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member) #define rds_stats_add_which(which, member, count) do { \ per_cpu(which, get_cpu()).member += count; \ put_cpu(); \ } while (0) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr); /* sysctl.c */ int rds_sysctl_init(void); void rds_sysctl_exit(void); extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_default; extern unsigned long rds_sysctl_sndbuf_max; extern unsigned long rds_sysctl_reconnect_min_jiffies; extern unsigned long rds_sysctl_reconnect_max_jiffies; extern unsigned int rds_sysctl_max_unacked_packets; extern unsigned int rds_sysctl_max_unacked_bytes; extern unsigned int rds_sysctl_ping_enable; extern unsigned long rds_sysctl_trace_flags; extern unsigned int rds_sysctl_trace_level; /* threads.c */ int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; void rds_queue_reconnect(struct rds_conn_path *cp); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); void rds_connect_path_complete(struct rds_conn_path *conn, int curr); void rds_connect_complete(struct rds_connection *conn); int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2); /* transport.c */ void rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(struct net *net, const struct in6_addr *addr, __u32 scope_id); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); struct rds_transport *rds_trans_get(int t_type); #endif |
1 1 3 2 2 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 | // SPDX-License-Identifier: GPL-2.0 /* * ESSIV skcipher and aead template for block encryption * * This template encapsulates the ESSIV IV generation algorithm used by * dm-crypt and fscrypt, which converts the initial vector for the skcipher * used for block encryption, by encrypting it using the hash of the * skcipher key as encryption key. Usually, the input IV is a 64-bit sector * number in LE representation zero-padded to the size of the IV, but this * is not assumed by this driver. * * The typical use of this template is to instantiate the skcipher * 'essiv(cbc(aes),sha256)', which is the only instantiation used by * fscrypt, and the most relevant one for dm-crypt. However, dm-crypt * also permits ESSIV to be used in combination with the authenc template, * e.g., 'essiv(authenc(hmac(sha256),cbc(aes)),sha256)', in which case * we need to instantiate an aead that accepts the same special key format * as the authenc template, and deals with the way the encrypted IV is * embedded into the AAD area of the aead request. This means the AEAD * flavor produced by this template is tightly coupled to the way dm-crypt * happens to use it. * * Copyright (c) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org> * * Heavily based on: * adiantum length-preserving encryption mode * * Copyright 2018 Google LLC */ #include <crypto/authenc.h> #include <crypto/internal/aead.h> #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> #include "internal.h" struct essiv_instance_ctx { union { struct crypto_skcipher_spawn skcipher_spawn; struct crypto_aead_spawn aead_spawn; } u; char essiv_cipher_name[CRYPTO_MAX_ALG_NAME]; char shash_driver_name[CRYPTO_MAX_ALG_NAME]; }; struct essiv_tfm_ctx { union { struct crypto_skcipher *skcipher; struct crypto_aead *aead; } u; struct crypto_cipher *essiv_cipher; struct crypto_shash *hash; int ivoffset; }; struct essiv_aead_request_ctx { struct scatterlist sg[4]; u8 *assoc; struct aead_request aead_req; }; static int essiv_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); u8 salt[HASH_MAX_DIGESTSIZE]; int err; crypto_skcipher_clear_flags(tctx->u.skcipher, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(tctx->u.skcipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(tctx->u.skcipher, key, keylen); if (err) return err; err = crypto_shash_tfm_digest(tctx->hash, key, keylen, salt); if (err) return err; crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(tctx->essiv_cipher, crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_cipher_setkey(tctx->essiv_cipher, salt, crypto_shash_digestsize(tctx->hash)); } static int essiv_aead_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen) { struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); SHASH_DESC_ON_STACK(desc, tctx->hash); struct crypto_authenc_keys keys; u8 salt[HASH_MAX_DIGESTSIZE]; int err; crypto_aead_clear_flags(tctx->u.aead, CRYPTO_TFM_REQ_MASK); crypto_aead_set_flags(tctx->u.aead, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); err = crypto_aead_setkey(tctx->u.aead, key, keylen); if (err) return err; if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) return -EINVAL; desc->tfm = tctx->hash; err = crypto_shash_init(desc) ?: crypto_shash_update(desc, keys.enckey, keys.enckeylen) ?: crypto_shash_finup(desc, keys.authkey, keys.authkeylen, salt); if (err) return err; crypto_cipher_clear_flags(tctx->essiv_cipher, CRYPTO_TFM_REQ_MASK); crypto_cipher_set_flags(tctx->essiv_cipher, crypto_aead_get_flags(tfm) & CRYPTO_TFM_REQ_MASK); return crypto_cipher_setkey(tctx->essiv_cipher, salt, crypto_shash_digestsize(tctx->hash)); } static int essiv_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); return crypto_aead_setauthsize(tctx->u.aead, authsize); } static void essiv_skcipher_done(void *data, int err) { struct skcipher_request *req = data; skcipher_request_complete(req, err); } static int essiv_skcipher_crypt(struct skcipher_request *req, bool enc) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); struct skcipher_request *subreq = skcipher_request_ctx(req); crypto_cipher_encrypt_one(tctx->essiv_cipher, req->iv, req->iv); skcipher_request_set_tfm(subreq, tctx->u.skcipher); skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, req->iv); skcipher_request_set_callback(subreq, skcipher_request_flags(req), essiv_skcipher_done, req); return enc ? crypto_skcipher_encrypt(subreq) : crypto_skcipher_decrypt(subreq); } static int essiv_skcipher_encrypt(struct skcipher_request *req) { return essiv_skcipher_crypt(req, true); } static int essiv_skcipher_decrypt(struct skcipher_request *req) { return essiv_skcipher_crypt(req, false); } static void essiv_aead_done(void *data, int err) { struct aead_request *req = data; struct essiv_aead_request_ctx *rctx = aead_request_ctx(req); if (err == -EINPROGRESS) goto out; kfree(rctx->assoc); out: aead_request_complete(req, err); } static int essiv_aead_crypt(struct aead_request *req, bool enc) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); const struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); struct essiv_aead_request_ctx *rctx = aead_request_ctx(req); struct aead_request *subreq = &rctx->aead_req; struct scatterlist *src = req->src; int err; crypto_cipher_encrypt_one(tctx->essiv_cipher, req->iv, req->iv); /* * dm-crypt embeds the sector number and the IV in the AAD region, so * we have to copy the converted IV into the right scatterlist before * we pass it on. */ rctx->assoc = NULL; if (req->src == req->dst || !enc) { scatterwalk_map_and_copy(req->iv, req->dst, req->assoclen - crypto_aead_ivsize(tfm), crypto_aead_ivsize(tfm), 1); } else { u8 *iv = (u8 *)aead_request_ctx(req) + tctx->ivoffset; int ivsize = crypto_aead_ivsize(tfm); int ssize = req->assoclen - ivsize; struct scatterlist *sg; int nents; if (ssize < 0) return -EINVAL; nents = sg_nents_for_len(req->src, ssize); if (nents < 0) return -EINVAL; memcpy(iv, req->iv, ivsize); sg_init_table(rctx->sg, 4); if (unlikely(nents > 1)) { /* * This is a case that rarely occurs in practice, but * for correctness, we have to deal with it nonetheless. */ rctx->assoc = kmalloc(ssize, GFP_ATOMIC); if (!rctx->assoc) return -ENOMEM; scatterwalk_map_and_copy(rctx->assoc, req->src, 0, ssize, 0); sg_set_buf(rctx->sg, rctx->assoc, ssize); } else { sg_set_page(rctx->sg, sg_page(req->src), ssize, req->src->offset); } sg_set_buf(rctx->sg + 1, iv, ivsize); sg = scatterwalk_ffwd(rctx->sg + 2, req->src, req->assoclen); if (sg != rctx->sg + 2) sg_chain(rctx->sg, 3, sg); src = rctx->sg; } aead_request_set_tfm(subreq, tctx->u.aead); aead_request_set_ad(subreq, req->assoclen); aead_request_set_callback(subreq, aead_request_flags(req), essiv_aead_done, req); aead_request_set_crypt(subreq, src, req->dst, req->cryptlen, req->iv); err = enc ? crypto_aead_encrypt(subreq) : crypto_aead_decrypt(subreq); if (rctx->assoc && err != -EINPROGRESS && err != -EBUSY) kfree(rctx->assoc); return err; } static int essiv_aead_encrypt(struct aead_request *req) { return essiv_aead_crypt(req, true); } static int essiv_aead_decrypt(struct aead_request *req) { return essiv_aead_crypt(req, false); } static int essiv_init_tfm(struct essiv_instance_ctx *ictx, struct essiv_tfm_ctx *tctx) { struct crypto_cipher *essiv_cipher; struct crypto_shash *hash; int err; essiv_cipher = crypto_alloc_cipher(ictx->essiv_cipher_name, 0, 0); if (IS_ERR(essiv_cipher)) return PTR_ERR(essiv_cipher); hash = crypto_alloc_shash(ictx->shash_driver_name, 0, 0); if (IS_ERR(hash)) { err = PTR_ERR(hash); goto err_free_essiv_cipher; } tctx->essiv_cipher = essiv_cipher; tctx->hash = hash; return 0; err_free_essiv_cipher: crypto_free_cipher(essiv_cipher); return err; } static int essiv_skcipher_init_tfm(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct essiv_instance_ctx *ictx = skcipher_instance_ctx(inst); struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *skcipher; int err; skcipher = crypto_spawn_skcipher(&ictx->u.skcipher_spawn); if (IS_ERR(skcipher)) return PTR_ERR(skcipher); crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) + crypto_skcipher_reqsize(skcipher)); err = essiv_init_tfm(ictx, tctx); if (err) { crypto_free_skcipher(skcipher); return err; } tctx->u.skcipher = skcipher; return 0; } static int essiv_aead_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct essiv_instance_ctx *ictx = aead_instance_ctx(inst); struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); struct crypto_aead *aead; unsigned int subreq_size; int err; BUILD_BUG_ON(offsetofend(struct essiv_aead_request_ctx, aead_req) != sizeof(struct essiv_aead_request_ctx)); aead = crypto_spawn_aead(&ictx->u.aead_spawn); if (IS_ERR(aead)) return PTR_ERR(aead); subreq_size = sizeof_field(struct essiv_aead_request_ctx, aead_req) + crypto_aead_reqsize(aead); tctx->ivoffset = offsetof(struct essiv_aead_request_ctx, aead_req) + subreq_size; crypto_aead_set_reqsize(tfm, tctx->ivoffset + crypto_aead_ivsize(aead)); err = essiv_init_tfm(ictx, tctx); if (err) { crypto_free_aead(aead); return err; } tctx->u.aead = aead; return 0; } static void essiv_skcipher_exit_tfm(struct crypto_skcipher *tfm) { struct essiv_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); crypto_free_skcipher(tctx->u.skcipher); crypto_free_cipher(tctx->essiv_cipher); crypto_free_shash(tctx->hash); } static void essiv_aead_exit_tfm(struct crypto_aead *tfm) { struct essiv_tfm_ctx *tctx = crypto_aead_ctx(tfm); crypto_free_aead(tctx->u.aead); crypto_free_cipher(tctx->essiv_cipher); crypto_free_shash(tctx->hash); } static void essiv_skcipher_free_instance(struct skcipher_instance *inst) { struct essiv_instance_ctx *ictx = skcipher_instance_ctx(inst); crypto_drop_skcipher(&ictx->u.skcipher_spawn); kfree(inst); } static void essiv_aead_free_instance(struct aead_instance *inst) { struct essiv_instance_ctx *ictx = aead_instance_ctx(inst); crypto_drop_aead(&ictx->u.aead_spawn); kfree(inst); } static bool parse_cipher_name(char *essiv_cipher_name, const char *cra_name) { const char *p, *q; int len; /* find the last opening parens */ p = strrchr(cra_name, '('); if (!p++) return false; /* find the first closing parens in the tail of the string */ q = strchr(p, ')'); if (!q) return false; len = q - p; if (len >= CRYPTO_MAX_ALG_NAME) return false; memcpy(essiv_cipher_name, p, len); essiv_cipher_name[len] = '\0'; return true; } static bool essiv_supported_algorithms(const char *essiv_cipher_name, struct shash_alg *hash_alg, int ivsize) { struct crypto_alg *alg; bool ret = false; alg = crypto_alg_mod_lookup(essiv_cipher_name, CRYPTO_ALG_TYPE_CIPHER, CRYPTO_ALG_TYPE_MASK); if (IS_ERR(alg)) return false; if (hash_alg->digestsize < alg->cra_cipher.cia_min_keysize || hash_alg->digestsize > alg->cra_cipher.cia_max_keysize) goto out; if (ivsize != alg->cra_blocksize) goto out; if (crypto_shash_alg_needs_key(hash_alg)) goto out; ret = true; out: crypto_mod_put(alg); return ret; } static int essiv_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_alg_common *skcipher_alg = NULL; struct crypto_attr_type *algt; const char *inner_cipher_name; const char *shash_name; struct skcipher_instance *skcipher_inst = NULL; struct aead_instance *aead_inst = NULL; struct crypto_instance *inst; struct crypto_alg *base, *block_base; struct essiv_instance_ctx *ictx; struct aead_alg *aead_alg = NULL; struct crypto_alg *_hash_alg; struct shash_alg *hash_alg; int ivsize; u32 type; u32 mask; int err; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); inner_cipher_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(inner_cipher_name)) return PTR_ERR(inner_cipher_name); shash_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(shash_name)) return PTR_ERR(shash_name); type = algt->type & algt->mask; mask = crypto_algt_inherited_mask(algt); switch (type) { case CRYPTO_ALG_TYPE_LSKCIPHER: skcipher_inst = kzalloc(sizeof(*skcipher_inst) + sizeof(*ictx), GFP_KERNEL); if (!skcipher_inst) return -ENOMEM; inst = skcipher_crypto_instance(skcipher_inst); base = &skcipher_inst->alg.base; ictx = crypto_instance_ctx(inst); /* Symmetric cipher, e.g., "cbc(aes)" */ err = crypto_grab_skcipher(&ictx->u.skcipher_spawn, inst, inner_cipher_name, 0, mask); if (err) goto out_free_inst; skcipher_alg = crypto_spawn_skcipher_alg_common( &ictx->u.skcipher_spawn); block_base = &skcipher_alg->base; ivsize = skcipher_alg->ivsize; break; case CRYPTO_ALG_TYPE_AEAD: aead_inst = kzalloc(sizeof(*aead_inst) + sizeof(*ictx), GFP_KERNEL); if (!aead_inst) return -ENOMEM; inst = aead_crypto_instance(aead_inst); base = &aead_inst->alg.base; ictx = crypto_instance_ctx(inst); /* AEAD cipher, e.g., "authenc(hmac(sha256),cbc(aes))" */ err = crypto_grab_aead(&ictx->u.aead_spawn, inst, inner_cipher_name, 0, mask); if (err) goto out_free_inst; aead_alg = crypto_spawn_aead_alg(&ictx->u.aead_spawn); block_base = &aead_alg->base; if (!strstarts(block_base->cra_name, "authenc(")) { pr_warn("Only authenc() type AEADs are supported by ESSIV\n"); err = -EINVAL; goto out_drop_skcipher; } ivsize = aead_alg->ivsize; break; default: return -EINVAL; } if (!parse_cipher_name(ictx->essiv_cipher_name, block_base->cra_name)) { pr_warn("Failed to parse ESSIV cipher name from skcipher cra_name\n"); err = -EINVAL; goto out_drop_skcipher; } /* Synchronous hash, e.g., "sha256" */ _hash_alg = crypto_alg_mod_lookup(shash_name, CRYPTO_ALG_TYPE_SHASH, CRYPTO_ALG_TYPE_MASK | mask); if (IS_ERR(_hash_alg)) { err = PTR_ERR(_hash_alg); goto out_drop_skcipher; } hash_alg = __crypto_shash_alg(_hash_alg); /* Check the set of algorithms */ if (!essiv_supported_algorithms(ictx->essiv_cipher_name, hash_alg, ivsize)) { pr_warn("Unsupported essiv instantiation: essiv(%s,%s)\n", block_base->cra_name, hash_alg->base.cra_name); err = -EINVAL; goto out_free_hash; } /* record the driver name so we can instantiate this exact algo later */ strscpy(ictx->shash_driver_name, hash_alg->base.cra_driver_name, CRYPTO_MAX_ALG_NAME); /* Instance fields */ err = -ENAMETOOLONG; if (snprintf(base->cra_name, CRYPTO_MAX_ALG_NAME, "essiv(%s,%s)", block_base->cra_name, hash_alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto out_free_hash; if (snprintf(base->cra_driver_name, CRYPTO_MAX_ALG_NAME, "essiv(%s,%s)", block_base->cra_driver_name, hash_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto out_free_hash; /* * hash_alg wasn't gotten via crypto_grab*(), so we need to inherit its * flags manually. */ base->cra_flags |= (hash_alg->base.cra_flags & CRYPTO_ALG_INHERITED_FLAGS); base->cra_blocksize = block_base->cra_blocksize; base->cra_ctxsize = sizeof(struct essiv_tfm_ctx); base->cra_alignmask = block_base->cra_alignmask; base->cra_priority = block_base->cra_priority; if (type == CRYPTO_ALG_TYPE_LSKCIPHER) { skcipher_inst->alg.setkey = essiv_skcipher_setkey; skcipher_inst->alg.encrypt = essiv_skcipher_encrypt; skcipher_inst->alg.decrypt = essiv_skcipher_decrypt; skcipher_inst->alg.init = essiv_skcipher_init_tfm; skcipher_inst->alg.exit = essiv_skcipher_exit_tfm; skcipher_inst->alg.min_keysize = skcipher_alg->min_keysize; skcipher_inst->alg.max_keysize = skcipher_alg->max_keysize; skcipher_inst->alg.ivsize = ivsize; skcipher_inst->alg.chunksize = skcipher_alg->chunksize; skcipher_inst->free = essiv_skcipher_free_instance; err = skcipher_register_instance(tmpl, skcipher_inst); } else { aead_inst->alg.setkey = essiv_aead_setkey; aead_inst->alg.setauthsize = essiv_aead_setauthsize; aead_inst->alg.encrypt = essiv_aead_encrypt; aead_inst->alg.decrypt = essiv_aead_decrypt; aead_inst->alg.init = essiv_aead_init_tfm; aead_inst->alg.exit = essiv_aead_exit_tfm; aead_inst->alg.ivsize = ivsize; aead_inst->alg.maxauthsize = crypto_aead_alg_maxauthsize(aead_alg); aead_inst->alg.chunksize = crypto_aead_alg_chunksize(aead_alg); aead_inst->free = essiv_aead_free_instance; err = aead_register_instance(tmpl, aead_inst); } if (err) goto out_free_hash; crypto_mod_put(_hash_alg); return 0; out_free_hash: crypto_mod_put(_hash_alg); out_drop_skcipher: if (type == CRYPTO_ALG_TYPE_LSKCIPHER) crypto_drop_skcipher(&ictx->u.skcipher_spawn); else crypto_drop_aead(&ictx->u.aead_spawn); out_free_inst: kfree(skcipher_inst); kfree(aead_inst); return err; } /* essiv(cipher_name, shash_name) */ static struct crypto_template essiv_tmpl = { .name = "essiv", .create = essiv_create, .module = THIS_MODULE, }; static int __init essiv_module_init(void) { return crypto_register_template(&essiv_tmpl); } static void __exit essiv_module_exit(void) { crypto_unregister_template(&essiv_tmpl); } subsys_initcall(essiv_module_init); module_exit(essiv_module_exit); MODULE_DESCRIPTION("ESSIV skcipher/aead wrapper for block encryption"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("essiv"); MODULE_IMPORT_NS("CRYPTO_INTERNAL"); |
13 10 16 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 | #ifndef LLC_PDU_H #define LLC_PDU_H /* * Copyright (c) 1997 by Procom Technology,Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/if_ether.h> /* Lengths of frame formats */ #define LLC_PDU_LEN_I 4 /* header and 2 control bytes */ #define LLC_PDU_LEN_S 4 #define LLC_PDU_LEN_U 3 /* header and 1 control byte */ /* header and 1 control byte and XID info */ #define LLC_PDU_LEN_U_XID (LLC_PDU_LEN_U + sizeof(struct llc_xid_info)) /* Known SAP addresses */ #define LLC_GLOBAL_SAP 0xFF #define LLC_NULL_SAP 0x00 /* not network-layer visible */ #define LLC_MGMT_INDIV 0x02 /* station LLC mgmt indiv addr */ #define LLC_MGMT_GRP 0x03 /* station LLC mgmt group addr */ #define LLC_RDE_SAP 0xA6 /* route ... */ /* SAP field bit masks */ #define LLC_ISO_RESERVED_SAP 0x02 #define LLC_SAP_GROUP_DSAP 0x01 #define LLC_SAP_RESP_SSAP 0x01 /* Group/individual DSAP indicator is DSAP field */ #define LLC_PDU_GROUP_DSAP_MASK 0x01 #define LLC_PDU_IS_GROUP_DSAP(pdu) \ ((pdu->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1) #define LLC_PDU_IS_INDIV_DSAP(pdu) \ (!(pdu->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1) /* Command/response PDU indicator in SSAP field */ #define LLC_PDU_CMD_RSP_MASK 0x01 #define LLC_PDU_CMD 0 #define LLC_PDU_RSP 1 #define LLC_PDU_IS_CMD(pdu) ((pdu->ssap & LLC_PDU_RSP) ? 0 : 1) #define LLC_PDU_IS_RSP(pdu) ((pdu->ssap & LLC_PDU_RSP) ? 1 : 0) /* Get PDU type from 2 lowest-order bits of control field first byte */ #define LLC_PDU_TYPE_I_MASK 0x01 /* 16-bit control field */ #define LLC_PDU_TYPE_S_MASK 0x03 #define LLC_PDU_TYPE_U_MASK 0x03 /* 8-bit control field */ #define LLC_PDU_TYPE_MASK 0x03 #define LLC_PDU_TYPE_I 0 /* first bit */ #define LLC_PDU_TYPE_S 1 /* first two bits */ #define LLC_PDU_TYPE_U 3 /* first two bits */ #define LLC_PDU_TYPE_U_XID 4 /* private type for detecting XID commands */ #define LLC_PDU_TYPE_IS_I(pdu) \ ((!(pdu->ctrl_1 & LLC_PDU_TYPE_I_MASK)) ? 1 : 0) #define LLC_PDU_TYPE_IS_U(pdu) \ (((pdu->ctrl_1 & LLC_PDU_TYPE_U_MASK) == LLC_PDU_TYPE_U) ? 1 : 0) #define LLC_PDU_TYPE_IS_S(pdu) \ (((pdu->ctrl_1 & LLC_PDU_TYPE_S_MASK) == LLC_PDU_TYPE_S) ? 1 : 0) /* U-format PDU control field masks */ #define LLC_U_PF_BIT_MASK 0x10 /* P/F bit mask */ #define LLC_U_PF_IS_1(pdu) ((pdu->ctrl_1 & LLC_U_PF_BIT_MASK) ? 1 : 0) #define LLC_U_PF_IS_0(pdu) ((!(pdu->ctrl_1 & LLC_U_PF_BIT_MASK)) ? 1 : 0) #define LLC_U_PDU_CMD_MASK 0xEC /* cmd/rsp mask */ #define LLC_U_PDU_CMD(pdu) (pdu->ctrl_1 & LLC_U_PDU_CMD_MASK) #define LLC_U_PDU_RSP(pdu) (pdu->ctrl_1 & LLC_U_PDU_CMD_MASK) #define LLC_1_PDU_CMD_UI 0x00 /* Type 1 cmds/rsps */ #define LLC_1_PDU_CMD_XID 0xAC #define LLC_1_PDU_CMD_TEST 0xE0 #define LLC_2_PDU_CMD_SABME 0x6C /* Type 2 cmds/rsps */ #define LLC_2_PDU_CMD_DISC 0x40 #define LLC_2_PDU_RSP_UA 0x60 #define LLC_2_PDU_RSP_DM 0x0C #define LLC_2_PDU_RSP_FRMR 0x84 /* Type 1 operations */ /* XID information field bit masks */ /* LLC format identifier (byte 1) */ #define LLC_XID_FMT_ID 0x81 /* first byte must be this */ /* LLC types/classes identifier (byte 2) */ #define LLC_XID_CLASS_ZEROS_MASK 0xE0 /* these must be zeros */ #define LLC_XID_CLASS_MASK 0x1F /* AND with byte to get below */ #define LLC_XID_NULL_CLASS_1 0x01 /* if NULL LSAP...use these */ #define LLC_XID_NULL_CLASS_2 0x03 #define LLC_XID_NULL_CLASS_3 0x05 #define LLC_XID_NULL_CLASS_4 0x07 #define LLC_XID_NNULL_TYPE_1 0x01 /* if non-NULL LSAP...use these */ #define LLC_XID_NNULL_TYPE_2 0x02 #define LLC_XID_NNULL_TYPE_3 0x04 #define LLC_XID_NNULL_TYPE_1_2 0x03 #define LLC_XID_NNULL_TYPE_1_3 0x05 #define LLC_XID_NNULL_TYPE_2_3 0x06 #define LLC_XID_NNULL_ALL 0x07 /* Sender Receive Window (byte 3) */ #define LLC_XID_RW_MASK 0xFE /* AND with value to get below */ #define LLC_XID_MIN_RW 0x02 /* lowest-order bit always zero */ /* Type 2 operations */ #define LLC_2_SEQ_NBR_MODULO ((u8) 128) /* I-PDU masks ('ctrl' is I-PDU control word) */ #define LLC_I_GET_NS(pdu) (u8)((pdu->ctrl_1 & 0xFE) >> 1) #define LLC_I_GET_NR(pdu) (u8)((pdu->ctrl_2 & 0xFE) >> 1) #define LLC_I_PF_BIT_MASK 0x01 #define LLC_I_PF_IS_0(pdu) ((!(pdu->ctrl_2 & LLC_I_PF_BIT_MASK)) ? 1 : 0) #define LLC_I_PF_IS_1(pdu) ((pdu->ctrl_2 & LLC_I_PF_BIT_MASK) ? 1 : 0) /* S-PDU supervisory commands and responses */ #define LLC_S_PDU_CMD_MASK 0x0C #define LLC_S_PDU_CMD(pdu) (pdu->ctrl_1 & LLC_S_PDU_CMD_MASK) #define LLC_S_PDU_RSP(pdu) (pdu->ctrl_1 & LLC_S_PDU_CMD_MASK) #define LLC_2_PDU_CMD_RR 0x00 /* rx ready cmd */ #define LLC_2_PDU_RSP_RR 0x00 /* rx ready rsp */ #define LLC_2_PDU_CMD_REJ 0x08 /* reject PDU cmd */ #define LLC_2_PDU_RSP_REJ 0x08 /* reject PDU rsp */ #define LLC_2_PDU_CMD_RNR 0x04 /* rx not ready cmd */ #define LLC_2_PDU_RSP_RNR 0x04 /* rx not ready rsp */ #define LLC_S_PF_BIT_MASK 0x01 #define LLC_S_PF_IS_0(pdu) ((!(pdu->ctrl_2 & LLC_S_PF_BIT_MASK)) ? 1 : 0) #define LLC_S_PF_IS_1(pdu) ((pdu->ctrl_2 & LLC_S_PF_BIT_MASK) ? 1 : 0) #define PDU_SUPV_GET_Nr(pdu) ((pdu->ctrl_2 & 0xFE) >> 1) #define PDU_GET_NEXT_Vr(sn) (((sn) + 1) & ~LLC_2_SEQ_NBR_MODULO) /* FRMR information field macros */ #define FRMR_INFO_LENGTH 5 /* 5 bytes of information */ /* * info is pointer to FRMR info field structure; 'rej_ctrl' is byte pointer * (if U-PDU) or word pointer to rejected PDU control field */ #define FRMR_INFO_SET_REJ_CNTRL(info,rej_ctrl) \ info->rej_pdu_ctrl = ((*((u8 *) rej_ctrl) & \ LLC_PDU_TYPE_U) != LLC_PDU_TYPE_U ? \ (u16)*((u16 *) rej_ctrl) : \ (((u16) *((u8 *) rej_ctrl)) & 0x00FF)) /* * Info is pointer to FRMR info field structure; 'vs' is a byte containing * send state variable value in low-order 7 bits (insure the lowest-order * bit remains zero (0)) */ #define FRMR_INFO_SET_Vs(info,vs) (info->curr_ssv = (((u8) vs) << 1)) #define FRMR_INFO_SET_Vr(info,vr) (info->curr_rsv = (((u8) vr) << 1)) /* * Info is pointer to FRMR info field structure; 'cr' is a byte containing * the C/R bit value in the low-order bit */ #define FRMR_INFO_SET_C_R_BIT(info, cr) (info->curr_rsv |= (((u8) cr) & 0x01)) /* * In the remaining five macros, 'info' is pointer to FRMR info field * structure; 'ind' is a byte containing the bit value to set in the * lowest-order bit) */ #define FRMR_INFO_SET_INVALID_PDU_CTRL_IND(info, ind) \ (info->ind_bits = ((info->ind_bits & 0xFE) | (((u8) ind) & 0x01))) #define FRMR_INFO_SET_INVALID_PDU_INFO_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xFD) | (((u8) ind) & 0x02))) #define FRMR_INFO_SET_PDU_INFO_2LONG_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xFB) | (((u8) ind) & 0x04))) #define FRMR_INFO_SET_PDU_INVALID_Nr_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xF7) | (((u8) ind) & 0x08))) #define FRMR_INFO_SET_PDU_INVALID_Ns_IND(info, ind) \ (info->ind_bits = ( (info->ind_bits & 0xEF) | (((u8) ind) & 0x10))) /* Sequence-numbered PDU format (4 bytes in length) */ struct llc_pdu_sn { u8 dsap; u8 ssap; u8 ctrl_1; u8 ctrl_2; } __packed; static inline struct llc_pdu_sn *llc_pdu_sn_hdr(struct sk_buff *skb) { return (struct llc_pdu_sn *)skb_network_header(skb); } /* Un-numbered PDU format (3 bytes in length) */ struct llc_pdu_un { u8 dsap; u8 ssap; u8 ctrl_1; } __packed; static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb) { return (struct llc_pdu_un *)skb_network_header(skb); } /** * llc_pdu_header_init - initializes pdu header * @skb: input skb that header must be set into it. * @type: type of PDU (U, I or S). * @ssap: source sap. * @dsap: destination sap. * @cr: command/response bit (0 or 1). * * This function sets DSAP, SSAP and command/Response bit in LLC header. */ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, u8 ssap, u8 dsap, u8 cr) { int hlen = 4; /* default value for I and S types */ struct llc_pdu_un *pdu; switch (type) { case LLC_PDU_TYPE_U: hlen = 3; break; case LLC_PDU_TYPE_U_XID: hlen = 6; break; } skb_push(skb, hlen); skb_reset_network_header(skb); pdu = llc_pdu_un_hdr(skb); pdu->dsap = dsap; pdu->ssap = ssap; pdu->ssap |= cr; } /** * llc_pdu_decode_sa - extracts, source address (MAC) of input frame * @skb: input skb that source address must be extracted from it. * @sa: pointer to source address (6 byte array). * * This function extracts source address(MAC) of input frame. */ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) { memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); } /** * llc_pdu_decode_da - extracts dest address of input frame * @skb: input skb that destination address must be extracted from it * @da: pointer to destination address (6 byte array). * * This function extracts destination address(MAC) of input frame. */ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) { memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); } /** * llc_pdu_decode_ssap - extracts source SAP of input frame * @skb: input skb that source SAP must be extracted from it. * @ssap: source SAP (output argument). * * This function extracts source SAP of input frame. Right bit of SSAP is * command/response bit. */ static inline void llc_pdu_decode_ssap(struct sk_buff *skb, u8 *ssap) { *ssap = llc_pdu_un_hdr(skb)->ssap & 0xFE; } /** * llc_pdu_decode_dsap - extracts dest SAP of input frame * @skb: input skb that destination SAP must be extracted from it. * @dsap: destination SAP (output argument). * * This function extracts destination SAP of input frame. right bit of * DSAP designates individual/group SAP. */ static inline void llc_pdu_decode_dsap(struct sk_buff *skb, u8 *dsap) { *dsap = llc_pdu_un_hdr(skb)->dsap & 0xFE; } /** * llc_pdu_init_as_ui_cmd - sets LLC header as UI PDU * @skb: input skb that header must be set into it. * * This function sets third byte of LLC header as a UI PDU. */ static inline void llc_pdu_init_as_ui_cmd(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_UI; } /** * llc_pdu_init_as_test_cmd - sets PDU as TEST * @skb: Address of the skb to build * * Sets a PDU as TEST */ static inline void llc_pdu_init_as_test_cmd(struct sk_buff *skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_test_rsp - build TEST response PDU * @skb: Address of the skb to build * @ev_skb: The received TEST command PDU frame * * Builds a pdu frame as a TEST response. */ static inline void llc_pdu_init_as_test_rsp(struct sk_buff *skb, struct sk_buff *ev_skb) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; if (ev_skb->protocol == htons(ETH_P_802_2)) { struct llc_pdu_un *ev_pdu = llc_pdu_un_hdr(ev_skb); int dsize; dsize = ntohs(eth_hdr(ev_skb)->h_proto) - 3; memcpy(((u8 *)pdu) + 3, ((u8 *)ev_pdu) + 3, dsize); skb_put(skb, dsize); } } /* LLC Type 1 XID command/response information fields format */ struct llc_xid_info { u8 fmt_id; /* always 0x81 for LLC */ u8 type; /* different if NULL/non-NULL LSAP */ u8 rw; /* sender receive window */ } __packed; /** * llc_pdu_init_as_xid_cmd - sets bytes 3, 4 & 5 of LLC header as XID * @skb: input skb that header must be set into it. * @svcs_supported: The class of the LLC (I or II) * @rx_window: The size of the receive window of the LLC * * This function sets third,fourth,fifth and sixth bytes of LLC header as * a XID PDU. */ static inline void llc_pdu_init_as_xid_cmd(struct sk_buff *skb, u8 svcs_supported, u8 rx_window) { struct llc_xid_info *xid_info; struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_XID; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; xid_info = (struct llc_xid_info *)(((u8 *)&pdu->ctrl_1) + 1); xid_info->fmt_id = LLC_XID_FMT_ID; /* 0x81 */ xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; /* size of receive window */ /* no need to push/put since llc_pdu_header_init() has already * pushed 3 + 3 bytes */ } /** * llc_pdu_init_as_xid_rsp - builds XID response PDU * @skb: Address of the skb to build * @svcs_supported: The class of the LLC (I or II) * @rx_window: The size of the receive window of the LLC * * Builds a pdu frame as an XID response. */ static inline void llc_pdu_init_as_xid_rsp(struct sk_buff *skb, u8 svcs_supported, u8 rx_window) { struct llc_xid_info *xid_info; struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_1_PDU_CMD_XID; pdu->ctrl_1 |= LLC_U_PF_BIT_MASK; xid_info = (struct llc_xid_info *)(((u8 *)&pdu->ctrl_1) + 1); xid_info->fmt_id = LLC_XID_FMT_ID; xid_info->type = svcs_supported; xid_info->rw = rx_window << 1; skb_put(skb, sizeof(struct llc_xid_info)); } /* LLC Type 2 FRMR response information field format */ struct llc_frmr_info { u16 rej_pdu_ctrl; /* bits 1-8 if U-PDU */ u8 curr_ssv; /* current send state variable val */ u8 curr_rsv; /* current receive state variable */ u8 ind_bits; /* indicator bits set with macro */ } __packed; void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 type); void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value); void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit); void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit); void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr); void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr); void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit); void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit); void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu, u8 f_bit, u8 vs, u8 vr, u8 vzyxw); void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr); void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit); #endif /* LLC_PDU_H */ |
15 68 2 2 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | // SPDX-License-Identifier: GPL-2.0-only /* dummy.c: a dummy net driver The purpose of this driver is to provide a device to point a route through, but not to actually transmit packets. Why? If you have a machine whose only connection is an occasional PPP/SLIP/PLIP link, you can only connect to your own hostname when the link is up. Otherwise you have to use localhost. This isn't very consistent. One solution is to set up a dummy link using PPP/SLIP/PLIP, but this seems (to me) too much overhead for too little gain. This driver provides a small alternative. Thus you can do [when not running slip] ifconfig dummy slip.addr.ess.here up [to go to slip] ifconfig dummy down dip whatever This was written by looking at Donald Becker's skeleton driver and the loopback driver. I then threw away anything that didn't apply! Thanks to Alan Cox for the key clue on what to do with misguided packets. Nick Holloway, 27th May 1994 [I tweaked this explanation a little but that's all] Alan Cox, 30th May 1994 */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #define DRV_NAME "dummy" static int numdummies = 1; /* fake multicast ability */ static void set_multicast_list(struct net_device *dev) { } static void dummy_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { dev_lstats_read(dev, &stats->tx_packets, &stats->tx_bytes); } static netdev_tx_t dummy_xmit(struct sk_buff *skb, struct net_device *dev) { dev_lstats_add(dev, skb->len); skb_tx_timestamp(skb); dev_kfree_skb(skb); return NETDEV_TX_OK; } static int dummy_dev_init(struct net_device *dev) { dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; netdev_lockdep_set_classes(dev); return 0; } static int dummy_change_carrier(struct net_device *dev, bool new_carrier) { if (new_carrier) netif_carrier_on(dev); else netif_carrier_off(dev); return 0; } static const struct net_device_ops dummy_netdev_ops = { .ndo_init = dummy_dev_init, .ndo_start_xmit = dummy_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_rx_mode = set_multicast_list, .ndo_set_mac_address = eth_mac_addr, .ndo_get_stats64 = dummy_get_stats64, .ndo_change_carrier = dummy_change_carrier, }; static const struct ethtool_ops dummy_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; static void dummy_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &dummy_netdev_ops; dev->ethtool_ops = &dummy_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->lltx = true; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST; dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA; dev->features |= NETIF_F_GSO_ENCAP_ALL; dev->hw_features |= dev->features; dev->hw_enc_features |= dev->features; eth_hw_addr_random(dev); dev->min_mtu = 0; dev->max_mtu = 0; } static int dummy_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } return 0; } static struct rtnl_link_ops dummy_link_ops __read_mostly = { .kind = DRV_NAME, .setup = dummy_setup, .validate = dummy_validate, }; /* Number of dummy devices to be set up by this module. */ module_param(numdummies, int, 0); MODULE_PARM_DESC(numdummies, "Number of dummy pseudo devices"); static int __init dummy_init_one(void) { struct net_device *dev_dummy; int err; dev_dummy = alloc_netdev(0, "dummy%d", NET_NAME_ENUM, dummy_setup); if (!dev_dummy) return -ENOMEM; dev_dummy->rtnl_link_ops = &dummy_link_ops; err = register_netdevice(dev_dummy); if (err < 0) goto err; return 0; err: free_netdev(dev_dummy); return err; } static int __init dummy_init_module(void) { int i, err = 0; err = rtnl_link_register(&dummy_link_ops); if (err < 0) return err; rtnl_net_lock(&init_net); for (i = 0; i < numdummies && !err; i++) { err = dummy_init_one(); cond_resched(); } rtnl_net_unlock(&init_net); if (err < 0) rtnl_link_unregister(&dummy_link_ops); return err; } static void __exit dummy_cleanup_module(void) { rtnl_link_unregister(&dummy_link_ops); } module_init(dummy_init_module); module_exit(dummy_cleanup_module); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Dummy netdevice driver which discards all packets sent to it"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); |
10 9 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 6 6 1 5 5 4 4 4 4 6 6 5 5 7 1 1 5 1 10 2 8 6 1 6 11 11 11 6 6 1 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 | // SPDX-License-Identifier: GPL-2.0 /* Copyright 2011-2014 Autronica Fire and Security AS * * Author(s): * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * Frame router for HSR and PRP. */ #include "hsr_forward.h" #include <linux/types.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/if_vlan.h> #include "hsr_main.h" #include "hsr_framereg.h" struct hsr_node; /* The uses I can see for these HSR supervision frames are: * 1) Use the frames that are sent after node initialization ("HSR_TLV.Type = * 22") to reset any sequence_nr counters belonging to that node. Useful if * the other node's counter has been reset for some reason. * -- * Or not - resetting the counter and bridging the frame would create a * loop, unfortunately. * * 2) Use the LifeCheck frames to detect ring breaks. I.e. if no LifeCheck * frame is received from a particular node, we know something is wrong. * We just register these (as with normal frames) and throw them away. * * 3) Allow different MAC addresses for the two slave interfaces, using the * MacAddressA field. */ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) { struct ethhdr *eth_hdr; struct hsr_sup_tag *hsr_sup_tag; struct hsrv1_ethhdr_sp *hsr_V1_hdr; struct hsr_sup_tlv *hsr_sup_tlv; u16 total_length = 0; WARN_ON_ONCE(!skb_mac_header_was_set(skb)); eth_hdr = (struct ethhdr *)skb_mac_header(skb); /* Correct addr? */ if (!ether_addr_equal(eth_hdr->h_dest, hsr->sup_multicast_addr)) return false; /* Correct ether type?. */ if (!(eth_hdr->h_proto == htons(ETH_P_PRP) || eth_hdr->h_proto == htons(ETH_P_HSR))) return false; /* Get the supervision header from correct location. */ if (eth_hdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */ total_length = sizeof(struct hsrv1_ethhdr_sp); if (!pskb_may_pull(skb, total_length)) return false; hsr_V1_hdr = (struct hsrv1_ethhdr_sp *)skb_mac_header(skb); if (hsr_V1_hdr->hsr.encap_proto != htons(ETH_P_PRP)) return false; hsr_sup_tag = &hsr_V1_hdr->hsr_sup; } else { total_length = sizeof(struct hsrv0_ethhdr_sp); if (!pskb_may_pull(skb, total_length)) return false; hsr_sup_tag = &((struct hsrv0_ethhdr_sp *)skb_mac_header(skb))->hsr_sup; } if (hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_ANNOUNCE && hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_LIFE_CHECK && hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD && hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA) return false; if (hsr_sup_tag->tlv.HSR_TLV_length != 12 && hsr_sup_tag->tlv.HSR_TLV_length != sizeof(struct hsr_sup_payload)) return false; /* Get next tlv */ total_length += hsr_sup_tag->tlv.HSR_TLV_length; if (!pskb_may_pull(skb, total_length)) return false; skb_pull(skb, total_length); hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; skb_push(skb, total_length); /* if this is a redbox supervision frame we need to verify * that more data is available */ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { /* tlv length must be a length of a mac address */ if (hsr_sup_tlv->HSR_TLV_length != sizeof(struct hsr_sup_payload)) return false; /* make sure another tlv follows */ total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length; if (!pskb_may_pull(skb, total_length)) return false; /* get next tlv */ skb_pull(skb, total_length); hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; skb_push(skb, total_length); } /* end of tlvs must follow at the end */ if (hsr_sup_tlv->HSR_TLV_type == HSR_TLV_EOT && hsr_sup_tlv->HSR_TLV_length != 0) return false; return true; } static bool is_proxy_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) { struct hsr_sup_payload *payload; struct ethhdr *eth_hdr; u16 total_length = 0; eth_hdr = (struct ethhdr *)skb_mac_header(skb); /* Get the HSR protocol revision. */ if (eth_hdr->h_proto == htons(ETH_P_HSR)) total_length = sizeof(struct hsrv1_ethhdr_sp); else total_length = sizeof(struct hsrv0_ethhdr_sp); if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_payload))) return false; skb_pull(skb, total_length); payload = (struct hsr_sup_payload *)skb->data; skb_push(skb, total_length); /* For RedBox (HSR-SAN) check if we have received the supervision * frame with MAC addresses from own ProxyNodeTable. */ return hsr_is_node_in_db(&hsr->proxy_node_db, payload->macaddress_A); } static struct sk_buff *create_stripped_skb_hsr(struct sk_buff *skb_in, struct hsr_frame_info *frame) { struct sk_buff *skb; int copylen; unsigned char *dst, *src; skb_pull(skb_in, HSR_HLEN); skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC); skb_push(skb_in, HSR_HLEN); if (!skb) return NULL; skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start -= HSR_HLEN; copylen = 2 * ETH_ALEN; if (frame->is_vlan) copylen += VLAN_HLEN; src = skb_mac_header(skb_in); dst = skb_mac_header(skb); memcpy(dst, src, copylen); skb->protocol = eth_hdr(skb)->h_proto; return skb; } struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { if (!frame->skb_std) { if (frame->skb_hsr) frame->skb_std = create_stripped_skb_hsr(frame->skb_hsr, frame); else netdev_warn_once(port->dev, "Unexpected frame received in hsr_get_untagged_frame()\n"); if (!frame->skb_std) return NULL; } return skb_clone(frame->skb_std, GFP_ATOMIC); } struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { if (!frame->skb_std) { if (frame->skb_prp) { /* trim the skb by len - HSR_HLEN to exclude RCT */ skb_trim(frame->skb_prp, frame->skb_prp->len - HSR_HLEN); frame->skb_std = __pskb_copy(frame->skb_prp, skb_headroom(frame->skb_prp), GFP_ATOMIC); } else { /* Unexpected */ WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", __FILE__, __LINE__, port->dev->name); return NULL; } } return skb_clone(frame->skb_std, GFP_ATOMIC); } static void prp_set_lan_id(struct prp_rct *trailer, struct hsr_port *port) { int lane_id; if (port->type == HSR_PT_SLAVE_A) lane_id = 0; else lane_id = 1; /* Add net_id in the upper 3 bits of lane_id */ lane_id |= port->hsr->net_id; set_prp_lan_id(trailer, lane_id); } /* Tailroom for PRP rct should have been created before calling this */ static struct sk_buff *prp_fill_rct(struct sk_buff *skb, struct hsr_frame_info *frame, struct hsr_port *port) { struct prp_rct *trailer; int min_size = ETH_ZLEN; int lsdu_size; if (!skb) return skb; if (frame->is_vlan) min_size = VLAN_ETH_ZLEN; if (skb_put_padto(skb, min_size)) return NULL; trailer = (struct prp_rct *)skb_put(skb, HSR_HLEN); lsdu_size = skb->len - 14; if (frame->is_vlan) lsdu_size -= 4; prp_set_lan_id(trailer, port); set_prp_LSDU_size(trailer, lsdu_size); trailer->sequence_nr = htons(frame->sequence_nr); trailer->PRP_suffix = htons(ETH_P_PRP); skb->protocol = eth_hdr(skb)->h_proto; return skb; } static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr, struct hsr_port *port) { int path_id; if (port->type == HSR_PT_SLAVE_A) path_id = 0; else path_id = 1; set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id); } static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, struct hsr_port *port, u8 proto_version) { struct hsr_ethhdr *hsr_ethhdr; unsigned char *pc; int lsdu_size; /* pad to minimum packet size which is 60 + 6 (HSR tag) */ if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN)) return NULL; lsdu_size = skb->len - 14; if (frame->is_vlan) lsdu_size -= 4; pc = skb_mac_header(skb); if (frame->is_vlan) /* This 4-byte shift (size of a vlan tag) does not * mean that the ethhdr starts there. But rather it * provides the proper environment for accessing * the fields, such as hsr_tag etc., just like * when the vlan tag is not there. This is because * the hsr tag is after the vlan tag. */ hsr_ethhdr = (struct hsr_ethhdr *)(pc + VLAN_HLEN); else hsr_ethhdr = (struct hsr_ethhdr *)pc; hsr_set_path_id(hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; hsr_ethhdr->ethhdr.h_proto = htons(proto_version ? ETH_P_HSR : ETH_P_PRP); skb->protocol = hsr_ethhdr->ethhdr.h_proto; return skb; } /* If the original frame was an HSR tagged frame, just clone it to be sent * unchanged. Otherwise, create a private frame especially tagged for 'port'. */ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { unsigned char *dst, *src; struct sk_buff *skb; int movelen; if (frame->skb_hsr) { struct hsr_ethhdr *hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr); /* set the lane id properly */ hsr_set_path_id(hsr_ethhdr, port); return skb_clone(frame->skb_hsr, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); } /* Create the new skb with enough headroom to fit the HSR tag */ skb = __pskb_copy(frame->skb_std, skb_headroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); if (!skb) return NULL; skb_reset_mac_header(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) skb->csum_start += HSR_HLEN; movelen = ETH_HLEN; if (frame->is_vlan) movelen += VLAN_HLEN; src = skb_mac_header(skb); dst = skb_push(skb, HSR_HLEN); memmove(dst, src, movelen); skb_reset_mac_header(skb); /* skb_put_padto free skb on error and hsr_fill_tag returns NULL in * that case */ return hsr_fill_tag(skb, frame, port, port->hsr->prot_version); } struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { struct sk_buff *skb; if (frame->skb_prp) { struct prp_rct *trailer = skb_get_PRP_rct(frame->skb_prp); if (trailer) { prp_set_lan_id(trailer, port); } else { WARN_ONCE(!trailer, "errored PRP skb"); return NULL; } return skb_clone(frame->skb_prp, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); } skb = skb_copy_expand(frame->skb_std, skb_headroom(frame->skb_std), skb_tailroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC); return prp_fill_rct(skb, frame, port); } static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, struct hsr_node *node_src) { bool was_multicast_frame; int res, recv_len; was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST); hsr_addr_subst_source(node_src, skb); skb_pull(skb, ETH_HLEN); recv_len = skb->len; res = netif_rx(skb); if (res == NET_RX_DROP) { dev->stats.rx_dropped++; } else { dev->stats.rx_packets++; dev->stats.rx_bytes += recv_len; if (was_multicast_frame) dev->stats.multicast++; } } static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port, struct hsr_frame_info *frame) { if (frame->port_rcv->type == HSR_PT_MASTER) { hsr_addr_subst_dest(frame->node_src, skb, port); /* Address substitution (IEC62439-3 pp 26, 50): replace mac * address of outgoing frame with that of the outgoing slave's. */ ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr); } /* When HSR node is used as RedBox - the frame received from HSR ring * requires source MAC address (SA) replacement to one which can be * recognized by SAN devices (otherwise, frames are dropped by switch) */ if (port->type == HSR_PT_INTERLINK) ether_addr_copy(eth_hdr(skb)->h_source, port->hsr->macaddress_redbox); return dev_queue_xmit(skb); } bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { return ((frame->port_rcv->type == HSR_PT_SLAVE_A && port->type == HSR_PT_SLAVE_B) || (frame->port_rcv->type == HSR_PT_SLAVE_B && port->type == HSR_PT_SLAVE_A)); } bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port) { struct sk_buff *skb; if (port->dev->features & NETIF_F_HW_HSR_FWD) return prp_drop_frame(frame, port); /* RedBox specific frames dropping policies * * Do not send HSR supervisory frames to SAN devices */ if (frame->is_supervision && port->type == HSR_PT_INTERLINK) return true; /* Do not forward to other HSR port (A or B) unicast frames which * are addressed to interlink port (and are in the ProxyNodeTable). */ skb = frame->skb_hsr; if (skb && prp_drop_frame(frame, port) && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->proxy_node_db, eth_hdr(skb)->h_dest)) { return true; } /* Do not forward to port C (Interlink) frames from nodes A and B * if DA is in NodeTable. */ if ((frame->port_rcv->type == HSR_PT_SLAVE_A || frame->port_rcv->type == HSR_PT_SLAVE_B) && port->type == HSR_PT_INTERLINK) { skb = frame->skb_hsr; if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->node_db, eth_hdr(skb)->h_dest)) { return true; } } /* Do not forward to port A and B unicast frames received on the * interlink port if it is addressed to one of nodes registered in * the ProxyNodeTable. */ if ((port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) && frame->port_rcv->type == HSR_PT_INTERLINK) { skb = frame->skb_std; if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) && hsr_is_node_in_db(&port->hsr->proxy_node_db, eth_hdr(skb)->h_dest)) { return true; } } return false; } /* Forward the frame through all devices except: * - Back through the receiving device * - If it's a HSR frame: through a device where it has passed before * - if it's a PRP frame: through another PRP slave device (no bridge) * - To the local HSR master only if the frame is directly addressed to it, or * a non-supervision multicast or broadcast frame. * * HSR slave devices should insert a HSR tag into the frame, or forward the * frame unchanged if it's already tagged. Interlink devices should strip HSR * tags if they're of the non-HSR type (but only after duplicate discard). The * master device always strips HSR tags. */ static void hsr_forward_do(struct hsr_frame_info *frame) { struct hsr_port *port; struct sk_buff *skb; bool sent = false; hsr_for_each_port(frame->port_rcv->hsr, port) { struct hsr_priv *hsr = port->hsr; /* Don't send frame back the way it came */ if (port == frame->port_rcv) continue; /* Don't deliver locally unless we should */ if (port->type == HSR_PT_MASTER && !frame->is_local_dest) continue; /* Deliver frames directly addressed to us to master only */ if (port->type != HSR_PT_MASTER && frame->is_local_exclusive) continue; /* If hardware duplicate generation is enabled, only send out * one port. */ if ((port->dev->features & NETIF_F_HW_HSR_DUP) && sent) continue; /* Don't send frame over port where it has been sent before. * Also for SAN, this shouldn't be done. */ if (!frame->is_from_san && hsr_register_frame_out(port, frame->node_src, frame->sequence_nr)) continue; if (frame->is_supervision && port->type == HSR_PT_MASTER && !frame->is_proxy_supervision) { hsr_handle_sup_frame(frame); continue; } /* Check if frame is to be dropped. Eg. for PRP no forward * between ports, or sending HSR supervision to RedBox. */ if (hsr->proto_ops->drop_frame && hsr->proto_ops->drop_frame(frame, port)) continue; if (port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) skb = hsr->proto_ops->create_tagged_frame(frame, port); else skb = hsr->proto_ops->get_untagged_frame(frame, port); if (!skb) { frame->port_rcv->dev->stats.rx_dropped++; continue; } skb->dev = port->dev; if (port->type == HSR_PT_MASTER) { hsr_deliver_master(skb, port->dev, frame->node_src); } else { if (!hsr_xmit(skb, port, frame)) if (port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) sent = true; } } } static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb, struct hsr_frame_info *frame) { if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_dest)) { frame->is_local_exclusive = true; skb->pkt_type = PACKET_HOST; } else { frame->is_local_exclusive = false; } if (skb->pkt_type == PACKET_HOST || skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) { frame->is_local_dest = true; } else { frame->is_local_dest = false; } } static void handle_std_frame(struct sk_buff *skb, struct hsr_frame_info *frame) { struct hsr_port *port = frame->port_rcv; struct hsr_priv *hsr = port->hsr; frame->skb_hsr = NULL; frame->skb_prp = NULL; frame->skb_std = skb; if (port->type != HSR_PT_MASTER) frame->is_from_san = true; if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) { /* Sequence nr for the master/interlink node */ lockdep_assert_held(&hsr->seqnr_lock); frame->sequence_nr = hsr->sequence_nr; hsr->sequence_nr++; } } int hsr_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame) { struct hsr_port *port = frame->port_rcv; struct hsr_priv *hsr = port->hsr; /* HSRv0 supervisory frames double as a tag so treat them as tagged. */ if ((!hsr->prot_version && proto == htons(ETH_P_PRP)) || proto == htons(ETH_P_HSR)) { /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return -EINVAL; /* HSR tagged frame :- Data or Supervision */ frame->skb_std = NULL; frame->skb_prp = NULL; frame->skb_hsr = skb; frame->sequence_nr = hsr_get_skb_sequence_nr(skb); return 0; } /* Standard frame or PRP from master port */ handle_std_frame(skb, frame); return 0; } int prp_fill_frame_info(__be16 proto, struct sk_buff *skb, struct hsr_frame_info *frame) { /* Supervision frame */ struct prp_rct *rct = skb_get_PRP_rct(skb); if (rct && prp_check_lsdu_size(skb, rct, frame->is_supervision)) { frame->skb_hsr = NULL; frame->skb_std = NULL; frame->skb_prp = skb; frame->sequence_nr = prp_get_skb_sequence_nr(rct); return 0; } handle_std_frame(skb, frame); return 0; } static int fill_frame_info(struct hsr_frame_info *frame, struct sk_buff *skb, struct hsr_port *port) { struct hsr_priv *hsr = port->hsr; struct hsr_vlan_ethhdr *vlan_hdr; struct list_head *n_db; struct ethhdr *ethhdr; __be16 proto; int ret; /* Check if skb contains ethhdr */ if (skb->mac_len < sizeof(struct ethhdr)) return -EINVAL; memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); if (frame->is_supervision && hsr->redbox) frame->is_proxy_supervision = is_proxy_supervision_frame(port->hsr, skb); n_db = &hsr->node_db; if (port->type == HSR_PT_INTERLINK) n_db = &hsr->proxy_node_db; frame->node_src = hsr_get_node(port, n_db, skb, frame->is_supervision, port->type); if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ ethhdr = (struct ethhdr *)skb_mac_header(skb); frame->is_vlan = false; proto = ethhdr->h_proto; if (proto == htons(ETH_P_8021Q)) frame->is_vlan = true; if (frame->is_vlan) { if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr)) return -EINVAL; vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } frame->is_from_san = false; frame->port_rcv = port; ret = hsr->proto_ops->fill_frame_info(proto, skb, frame); if (ret) return ret; check_local_dest(port->hsr, skb, frame); return 0; } /* Must be called holding rcu read lock (because of the port parameter) */ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) { struct hsr_frame_info frame; rcu_read_lock(); if (fill_frame_info(&frame, skb, port) < 0) goto out_drop; hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); hsr_forward_do(&frame); rcu_read_unlock(); /* Gets called for ingress frames as well as egress from master port. * So check and increment stats for master port only here. */ if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) { port->dev->stats.tx_packets++; port->dev->stats.tx_bytes += skb->len; } kfree_skb(frame.skb_hsr); kfree_skb(frame.skb_prp); kfree_skb(frame.skb_std); return; out_drop: rcu_read_unlock(); port->dev->stats.tx_dropped++; kfree_skb(skb); } |
23 1 1 1 5 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 | // SPDX-License-Identifier: GPL-2.0-only /* String matching match for iptables * * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net> */ #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_string.h> #include <linux/textsearch.h> MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); MODULE_DESCRIPTION("Xtables: string-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); MODULE_ALIAS("ebt_string"); static bool string_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_string_info *conf = par->matchinfo; bool invert; invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; return (skb_find_text((struct sk_buff *)skb, conf->from_offset, conf->to_offset, conf->config) != UINT_MAX) ^ invert; } #define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) static int string_mt_check(const struct xt_mtchk_param *par) { struct xt_string_info *conf = par->matchinfo; struct ts_config *ts_conf; int flags = TS_AUTOLOAD; /* Damn, can't handle this case properly with iptables... */ if (conf->from_offset > conf->to_offset) return -EINVAL; if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0') return -EINVAL; if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) return -EINVAL; if (conf->u.v1.flags & ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)) return -EINVAL; if (conf->u.v1.flags & XT_STRING_FLAG_IGNORECASE) flags |= TS_IGNORECASE; ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, GFP_KERNEL, flags); if (IS_ERR(ts_conf)) return PTR_ERR(ts_conf); conf->config = ts_conf; return 0; } static void string_mt_destroy(const struct xt_mtdtor_param *par) { textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config); } static struct xt_match xt_string_mt_reg __read_mostly = { .name = "string", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, .matchsize = sizeof(struct xt_string_info), .usersize = offsetof(struct xt_string_info, config), .me = THIS_MODULE, }; static int __init string_mt_init(void) { return xt_register_match(&xt_string_mt_reg); } static void __exit string_mt_exit(void) { xt_unregister_match(&xt_string_mt_reg); } module_init(string_mt_init); module_exit(string_mt_exit); |
367 367 368 366 120 251 1806 1806 1809 1800 991 1159 9085 9069 9067 9065 9088 2535 3934 368 2681 3997 2322 2681 2680 2670 997 1809 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2021, Google LLC. * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/page_table_check.h> #include <linux/swap.h> #include <linux/swapops.h> #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt struct page_table_check { atomic_t anon_map_count; atomic_t file_map_count; }; static bool __page_table_check_enabled __initdata = IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); EXPORT_SYMBOL(page_table_check_disabled); static int __init early_page_table_check_param(char *buf) { return kstrtobool(buf, &__page_table_check_enabled); } early_param("page_table_check", early_page_table_check_param); static bool __init need_page_table_check(void) { return __page_table_check_enabled; } static void __init init_page_table_check(void) { if (!__page_table_check_enabled) return; static_branch_disable(&page_table_check_disabled); } struct page_ext_operations page_table_check_ops = { .size = sizeof(struct page_table_check), .need = need_page_table_check, .init = init_page_table_check, .need_shared_flags = false, }; static struct page_table_check *get_page_table_check(struct page_ext *page_ext) { BUG_ON(!page_ext); return page_ext_data(page_ext, &page_table_check_ops); } /* * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. */ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) { struct page_ext *page_ext; struct page *page; unsigned long i; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); page_ext = page_ext_get(page); if (!page_ext) return; BUG_ON(PageSlab(page)); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { BUG_ON(atomic_read(&ptc->file_map_count)); BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); } else { BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); } page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } /* * A new entry is added to the page table, increment the counters for that page * verify that it is of correct type and is not being mapped with a different * type to a different process. */ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, bool rw) { struct page_ext *page_ext; struct page *page; unsigned long i; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); page_ext = page_ext_get(page); if (!page_ext) return; BUG_ON(PageSlab(page)); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { BUG_ON(atomic_read(&ptc->file_map_count)); BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); } else { BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); } page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } /* * page is on free list, or is being allocated, verify that counters are zeroes * crash if they are not. */ void __page_table_check_zero(struct page *page, unsigned int order) { struct page_ext *page_ext; unsigned long i; BUG_ON(PageSlab(page)); page_ext = page_ext_get(page); if (!page_ext) return; for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_read(&ptc->file_map_count)); page_ext = page_ext_next(page_ext); } page_ext_put(page_ext); } void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (&init_mm == mm) return; if (pte_user_accessible_page(pte)) { page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pte_clear); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (&init_mm == mm) return; if (pmd_user_accessible_page(pmd)) { page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (&init_mm == mm) return; if (pud_user_accessible_page(pud)) { page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pud_clear); /* Whether the swap entry cached writable information */ static inline bool swap_cached_writable(swp_entry_t entry) { return is_writable_device_exclusive_entry(entry) || is_writable_device_private_entry(entry) || is_writable_migration_entry(entry); } static inline void page_table_check_pte_flags(pte_t pte) { if (pte_present(pte) && pte_uffd_wp(pte)) WARN_ON_ONCE(pte_write(pte)); else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); } void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { unsigned int i; if (&init_mm == mm) return; page_table_check_pte_flags(pte); for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } EXPORT_SYMBOL(__page_table_check_ptes_set); static inline void page_table_check_pmd_flags(pmd_t pmd) { if (pmd_present(pmd) && pmd_uffd_wp(pmd)) WARN_ON_ONCE(pmd_write(pmd)); else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); } void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; page_table_check_pmd_flags(pmd); __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); } } EXPORT_SYMBOL(__page_table_check_pmd_set); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (&init_mm == mm) return; __page_table_check_pud_clear(mm, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); } } EXPORT_SYMBOL(__page_table_check_pud_set); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { if (&init_mm == mm) return; if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { pte_t *ptep = pte_offset_map(&pmd, addr); unsigned long i; if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { __page_table_check_pte_clear(mm, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } pte_unmap(ptep - PTRS_PER_PTE); } } |
3 3 3 3 4 |