Total coverage: 224424 (12%)of 1928100
3 3 3 1 1 1 1 1 1 3 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 // SPDX-License-Identifier: GPL-2.0-only /* * lec.c: Lan Emulation driver * * Marko Kiiskila <mkiiskila@yahoo.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/capability.h> /* We are ethernet device */ #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <net/sock.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <asm/byteorder.h> #include <linux/uaccess.h> #include <net/arp.h> #include <net/dst.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/seq_file.h> /* And atm device */ #include <linux/atmdev.h> #include <linux/atmlec.h> /* Proxy LEC knows about bridging */ #if IS_ENABLED(CONFIG_BRIDGE) #include "../bridge/br_private.h" static unsigned char bridge_ula_lec[] = { 0x01, 0x80, 0xc2, 0x00, 0x00 }; #endif /* Modular too */ #include <linux/module.h> #include <linux/init.h> /* Hardening for Spectre-v1 */ #include <linux/nospec.h> #include "lec.h" #include "lec_arpc.h" #include "resources.h" #define DUMP_PACKETS 0 /* * 0 = None, * 1 = 30 first bytes * 2 = Whole packet */ #define LEC_UNRES_QUE_LEN 8 /* * number of tx packets to queue for a * single destination while waiting for SVC */ static int lec_open(struct net_device *dev); static netdev_tx_t lec_start_xmit(struct sk_buff *skb, struct net_device *dev); static int lec_close(struct net_device *dev); static struct lec_arp_table *lec_arp_find(struct lec_priv *priv, const unsigned char *mac_addr); static int lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove); /* LANE2 functions */ static void lane2_associate_ind(struct net_device *dev, const u8 *mac_address, const u8 *tlvs, u32 sizeoftlvs); static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force, u8 **tlvs, u32 *sizeoftlvs); static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst, const u8 *tlvs, u32 sizeoftlvs); static int lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long permanent); static void lec_arp_check_empties(struct lec_priv *priv, struct atm_vcc *vcc, struct sk_buff *skb); static void lec_arp_destroy(struct lec_priv *priv); static void lec_arp_init(struct lec_priv *priv); static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, const unsigned char *mac_to_find, int is_rdesc, struct lec_arp_table **ret_entry); static void lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, const unsigned char *atm_addr, unsigned long remoteflag, unsigned int targetless_le_arp); static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id); static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc); static void lec_set_flush_tran_id(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long tran_id); static void lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, struct atm_vcc *vcc, void (*old_push)(struct atm_vcc *vcc, struct sk_buff *skb)); static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc); /* must be done under lec_arp_lock */ static inline void lec_arp_hold(struct lec_arp_table *entry) { refcount_inc(&entry->usage); } static inline void lec_arp_put(struct lec_arp_table *entry) { if (refcount_dec_and_test(&entry->usage)) kfree(entry); } static struct lane2_ops lane2_ops = { .resolve = lane2_resolve, /* spec 3.1.3 */ .associate_req = lane2_associate_req, /* spec 3.1.4 */ .associate_indicator = NULL /* spec 3.1.5 */ }; static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; /* Device structures */ static struct net_device *dev_lec[MAX_LEC_ITF]; static DEFINE_MUTEX(lec_mutex); #if IS_ENABLED(CONFIG_BRIDGE) static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) { char *buff; struct lec_priv *priv; /* * Check if this is a BPDU. If so, ask zeppelin to send * LE_TOPOLOGY_REQUEST with the same value of Topology Change bit * as the Config BPDU has */ buff = skb->data + skb->dev->hard_header_len; if (*buff++ == 0x42 && *buff++ == 0x42 && *buff++ == 0x03) { struct sock *sk; struct sk_buff *skb2; struct atmlec_msg *mesg; skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); if (skb2 == NULL) return; skb2->len = sizeof(struct atmlec_msg); mesg = (struct atmlec_msg *)skb2->data; mesg->type = l_topology_change; buff += 4; mesg->content.normal.flag = *buff & 0x01; /* 0x01 is topology change */ priv = netdev_priv(dev); atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); sk->sk_data_ready(sk); } } #endif /* IS_ENABLED(CONFIG_BRIDGE) */ /* * Open/initialize the netdevice. This is called (in the current kernel) * sometime after booting when the 'ifconfig' program is run. * * This routine should set everything up anew at each open, even * registers that "should" only need to be set once at boot, so that * there is non-reboot way to recover if something goes wrong. */ static int lec_open(struct net_device *dev) { netif_start_queue(dev); return 0; } static void lec_send(struct atm_vcc *vcc, struct sk_buff *skb) { struct net_device *dev = skb->dev; unsigned int len = skb->len; ATM_SKB(skb)->vcc = vcc; atm_account_tx(vcc, skb); if (vcc->send(vcc, skb) < 0) { dev->stats.tx_dropped++; return; } dev->stats.tx_packets++; dev->stats.tx_bytes += len; } static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue) { pr_info("%s\n", dev->name); netif_trans_update(dev); netif_wake_queue(dev); } static netdev_tx_t lec_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *skb2; struct lec_priv *priv = netdev_priv(dev); struct lecdatahdr_8023 *lec_h; struct atm_vcc *vcc; struct lec_arp_table *entry; unsigned char *dst; int min_frame_size; int is_rdesc; pr_debug("called\n"); if (!priv->lecd) { pr_info("%s:No lecd attached\n", dev->name); dev->stats.tx_errors++; netif_stop_queue(dev); kfree_skb(skb); return NETDEV_TX_OK; } pr_debug("skbuff head:%lx data:%lx tail:%lx end:%lx\n", (long)skb->head, (long)skb->data, (long)skb_tail_pointer(skb), (long)skb_end_pointer(skb)); #if IS_ENABLED(CONFIG_BRIDGE) if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0) lec_handle_bridge(skb, dev); #endif /* Make sure we have room for lec_id */ if (skb_headroom(skb) < 2) { pr_debug("reallocating skb\n"); skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN); if (unlikely(!skb2)) { kfree_skb(skb); return NETDEV_TX_OK; } consume_skb(skb); skb = skb2; } skb_push(skb, 2); /* Put le header to place */ lec_h = (struct lecdatahdr_8023 *)skb->data; lec_h->le_header = htons(priv->lecid); #if DUMP_PACKETS >= 2 #define MAX_DUMP_SKB 99 #elif DUMP_PACKETS >= 1 #define MAX_DUMP_SKB 30 #endif #if DUMP_PACKETS >= 1 printk(KERN_DEBUG "%s: send datalen:%ld lecid:%4.4x\n", dev->name, skb->len, priv->lecid); print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1, skb->data, min(skb->len, MAX_DUMP_SKB), true); #endif /* DUMP_PACKETS >= 1 */ /* Minimum ethernet-frame size */ min_frame_size = LEC_MINIMUM_8023_SIZE; if (skb->len < min_frame_size) { if ((skb->len + skb_tailroom(skb)) < min_frame_size) { skb2 = skb_copy_expand(skb, 0, min_frame_size - skb->truesize, GFP_ATOMIC); dev_kfree_skb(skb); if (skb2 == NULL) { dev->stats.tx_dropped++; return NETDEV_TX_OK; } skb = skb2; } skb_put(skb, min_frame_size - skb->len); } /* Send to right vcc */ is_rdesc = 0; dst = lec_h->h_dest; entry = NULL; vcc = lec_arp_resolve(priv, dst, is_rdesc, &entry); pr_debug("%s:vcc:%p vcc_flags:%lx, entry:%p\n", dev->name, vcc, vcc ? vcc->flags : 0, entry); if (!vcc || !test_bit(ATM_VF_READY, &vcc->flags)) { if (entry && (entry->tx_wait.qlen < LEC_UNRES_QUE_LEN)) { pr_debug("%s:queuing packet, MAC address %pM\n", dev->name, lec_h->h_dest); skb_queue_tail(&entry->tx_wait, skb); } else { pr_debug("%s:tx queue full or no arp entry, dropping, MAC address: %pM\n", dev->name, lec_h->h_dest); dev->stats.tx_dropped++; dev_kfree_skb(skb); } goto out; } #if DUMP_PACKETS > 0 printk(KERN_DEBUG "%s:sending to vpi:%d vci:%d\n", dev->name, vcc->vpi, vcc->vci); #endif /* DUMP_PACKETS > 0 */ while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) { pr_debug("emptying tx queue, MAC address %pM\n", lec_h->h_dest); lec_send(vcc, skb2); } lec_send(vcc, skb); if (!atm_may_send(vcc, 0)) { struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); vpriv->xoff = 1; netif_stop_queue(dev); /* * vcc->pop() might have occurred in between, making * the vcc usuable again. Since xmit is serialized, * this is the only situation we have to re-test. */ if (atm_may_send(vcc, 0)) netif_wake_queue(dev); } out: if (entry) lec_arp_put(entry); netif_trans_update(dev); return NETDEV_TX_OK; } /* The inverse routine to net_open(). */ static int lec_close(struct net_device *dev) { netif_stop_queue(dev); return 0; } static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) { static const u8 zero_addr[ETH_ALEN] = {}; unsigned long flags; struct net_device *dev = (struct net_device *)vcc->proto_data; struct lec_priv *priv = netdev_priv(dev); struct atmlec_msg *mesg; struct lec_arp_table *entry; char *tmp; /* FIXME */ WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc)); mesg = (struct atmlec_msg *)skb->data; tmp = skb->data; tmp += sizeof(struct atmlec_msg); pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type); switch (mesg->type) { case l_set_mac_addr: eth_hw_addr_set(dev, mesg->content.normal.mac_addr); break; case l_del_mac_addr: eth_hw_addr_set(dev, zero_addr); break; case l_addr_delete: lec_addr_delete(priv, mesg->content.normal.atm_addr, mesg->content.normal.flag); break; case l_topology_change: priv->topology_change = mesg->content.normal.flag; break; case l_flush_complete: lec_flush_complete(priv, mesg->content.normal.flag); break; case l_narp_req: /* LANE2: see 7.1.35 in the lane2 spec */ spin_lock_irqsave(&priv->lec_arp_lock, flags); entry = lec_arp_find(priv, mesg->content.normal.mac_addr); lec_arp_remove(priv, entry); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); if (mesg->content.normal.no_source_le_narp) break; fallthrough; case l_arp_update: lec_arp_update(priv, mesg->content.normal.mac_addr, mesg->content.normal.atm_addr, mesg->content.normal.flag, mesg->content.normal.targetless_le_arp); pr_debug("in l_arp_update\n"); if (mesg->sizeoftlvs != 0) { /* LANE2 3.1.5 */ pr_debug("LANE2 3.1.5, got tlvs, size %d\n", mesg->sizeoftlvs); lane2_associate_ind(dev, mesg->content.normal.mac_addr, tmp, mesg->sizeoftlvs); } break; case l_config: priv->maximum_unknown_frame_count = mesg->content.config.maximum_unknown_frame_count; priv->max_unknown_frame_time = (mesg->content.config.max_unknown_frame_time * HZ); priv->max_retry_count = mesg->content.config.max_retry_count; priv->aging_time = (mesg->content.config.aging_time * HZ); priv->forward_delay_time = (mesg->content.config.forward_delay_time * HZ); priv->arp_response_time = (mesg->content.config.arp_response_time * HZ); priv->flush_timeout = (mesg->content.config.flush_timeout * HZ); priv->path_switching_delay = (mesg->content.config.path_switching_delay * HZ); priv->lane_version = mesg->content.config.lane_version; /* LANE2 */ priv->lane2_ops = NULL; if (priv->lane_version > 1) priv->lane2_ops = &lane2_ops; rtnl_lock(); if (dev_set_mtu(dev, mesg->content.config.mtu)) pr_info("%s: change_mtu to %d failed\n", dev->name, mesg->content.config.mtu); rtnl_unlock(); priv->is_proxy = mesg->content.config.is_proxy; break; case l_flush_tran_id: lec_set_flush_tran_id(priv, mesg->content.normal.atm_addr, mesg->content.normal.flag); break; case l_set_lecid: priv->lecid = (unsigned short)(0xffff & mesg->content.normal.flag); break; case l_should_bridge: #if IS_ENABLED(CONFIG_BRIDGE) { pr_debug("%s: bridge zeppelin asks about %pM\n", dev->name, mesg->content.proxy.mac_addr); if (br_fdb_test_addr_hook == NULL) break; if (br_fdb_test_addr_hook(dev, mesg->content.proxy.mac_addr)) { /* hit from bridge table, send LE_ARP_RESPONSE */ struct sk_buff *skb2; struct sock *sk; pr_debug("%s: entry found, responding to zeppelin\n", dev->name); skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); if (skb2 == NULL) break; skb2->len = sizeof(struct atmlec_msg); skb_copy_to_linear_data(skb2, mesg, sizeof(*mesg)); atm_force_charge(priv->lecd, skb2->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb2); sk->sk_data_ready(sk); } } #endif /* IS_ENABLED(CONFIG_BRIDGE) */ break; default: pr_info("%s: Unknown message type %d\n", dev->name, mesg->type); dev_kfree_skb(skb); return -EINVAL; } dev_kfree_skb(skb); return 0; } static void lec_atm_close(struct atm_vcc *vcc) { struct sk_buff *skb; struct net_device *dev = (struct net_device *)vcc->proto_data; struct lec_priv *priv = netdev_priv(dev); priv->lecd = NULL; /* Do something needful? */ netif_stop_queue(dev); lec_arp_destroy(priv); if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) pr_info("%s closing with messages pending\n", dev->name); while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) { atm_return(vcc, skb->truesize); dev_kfree_skb(skb); } pr_info("%s: Shut down!\n", dev->name); module_put(THIS_MODULE); } static const struct atmdev_ops lecdev_ops = { .close = lec_atm_close, .send = lec_atm_send }; static struct atm_dev lecatm_dev = { .ops = &lecdev_ops, .type = "lec", .number = 999, /* dummy device number */ .lock = __SPIN_LOCK_UNLOCKED(lecatm_dev.lock) }; /* * LANE2: new argument struct sk_buff *data contains * the LE_ARP based TLVs introduced in the LANE2 spec */ static int send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, const unsigned char *mac_addr, const unsigned char *atm_addr, struct sk_buff *data) { struct sock *sk; struct sk_buff *skb; struct atmlec_msg *mesg; if (!priv || !priv->lecd) return -1; skb = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); if (!skb) return -1; skb->len = sizeof(struct atmlec_msg); mesg = (struct atmlec_msg *)skb->data; memset(mesg, 0, sizeof(struct atmlec_msg)); mesg->type = type; if (data != NULL) mesg->sizeoftlvs = data->len; if (mac_addr) ether_addr_copy(mesg->content.normal.mac_addr, mac_addr); else mesg->content.normal.targetless_le_arp = 1; if (atm_addr) memcpy(&mesg->content.normal.atm_addr, atm_addr, ATM_ESA_LEN); atm_force_charge(priv->lecd, skb->truesize); sk = sk_atm(priv->lecd); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); if (data != NULL) { pr_debug("about to send %d bytes of data\n", data->len); atm_force_charge(priv->lecd, data->truesize); skb_queue_tail(&sk->sk_receive_queue, data); sk->sk_data_ready(sk); } return 0; } static void lec_set_multicast_list(struct net_device *dev) { /* * by default, all multicast frames arrive over the bus. * eventually support selective multicast service */ } static const struct net_device_ops lec_netdev_ops = { .ndo_open = lec_open, .ndo_stop = lec_close, .ndo_start_xmit = lec_start_xmit, .ndo_tx_timeout = lec_tx_timeout, .ndo_set_rx_mode = lec_set_multicast_list, }; static const unsigned char lec_ctrl_magic[] = { 0xff, 0x00, 0x01, 0x01 }; #define LEC_DATA_DIRECT_8023 2 #define LEC_DATA_DIRECT_8025 3 static int lec_is_data_direct(struct atm_vcc *vcc) { return ((vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8023) || (vcc->sap.blli[0].l3.tr9577.snap[4] == LEC_DATA_DIRECT_8025)); } static void lec_push(struct atm_vcc *vcc, struct sk_buff *skb) { unsigned long flags; struct net_device *dev = (struct net_device *)vcc->proto_data; struct lec_priv *priv = netdev_priv(dev); #if DUMP_PACKETS > 0 printk(KERN_DEBUG "%s: vcc vpi:%d vci:%d\n", dev->name, vcc->vpi, vcc->vci); #endif if (!skb) { pr_debug("%s: null skb\n", dev->name); lec_vcc_close(priv, vcc); return; } #if DUMP_PACKETS >= 2 #define MAX_SKB_DUMP 99 #elif DUMP_PACKETS >= 1 #define MAX_SKB_DUMP 30 #endif #if DUMP_PACKETS > 0 printk(KERN_DEBUG "%s: rcv datalen:%ld lecid:%4.4x\n", dev->name, skb->len, priv->lecid); print_hex_dump(KERN_DEBUG, "", DUMP_OFFSET, 16, 1, skb->data, min(MAX_SKB_DUMP, skb->len), true); #endif /* DUMP_PACKETS > 0 */ if (memcmp(skb->data, lec_ctrl_magic, 4) == 0) { /* Control frame, to daemon */ struct sock *sk = sk_atm(vcc); pr_debug("%s: To daemon\n", dev->name); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); } else { /* Data frame, queue to protocol handlers */ struct lec_arp_table *entry; unsigned char *src, *dst; atm_return(vcc, skb->truesize); if (*(__be16 *) skb->data == htons(priv->lecid) || !priv->lecd || !(dev->flags & IFF_UP)) { /* * Probably looping back, or if lecd is missing, * lecd has gone down */ pr_debug("Ignoring frame...\n"); dev_kfree_skb(skb); return; } dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest; /* * If this is a Data Direct VCC, and the VCC does not match * the LE_ARP cache entry, delete the LE_ARP cache entry. */ spin_lock_irqsave(&priv->lec_arp_lock, flags); if (lec_is_data_direct(vcc)) { src = ((struct lecdatahdr_8023 *)skb->data)->h_source; entry = lec_arp_find(priv, src); if (entry && entry->vcc != vcc) { lec_arp_remove(priv, entry); lec_arp_put(entry); } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); if (!(dst[0] & 0x01) && /* Never filter Multi/Broadcast */ !priv->is_proxy && /* Proxy wants all the packets */ memcmp(dst, dev->dev_addr, dev->addr_len)) { dev_kfree_skb(skb); return; } if (!hlist_empty(&priv->lec_arp_empty_ones)) lec_arp_check_empties(priv, vcc, skb); skb_pull(skb, 2); /* skip lec_id */ skb->protocol = eth_type_trans(skb, dev); dev->stats.rx_packets++; dev->stats.rx_bytes += skb->len; memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); netif_rx(skb); } } static void lec_pop(struct atm_vcc *vcc, struct sk_buff *skb) { struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); struct net_device *dev = skb->dev; if (vpriv == NULL) { pr_info("vpriv = NULL!?!?!?\n"); return; } vpriv->old_pop(vcc, skb); if (vpriv->xoff && atm_may_send(vcc, 0)) { vpriv->xoff = 0; if (netif_running(dev) && netif_queue_stopped(dev)) netif_wake_queue(dev); } } static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) { struct lec_vcc_priv *vpriv; int bytes_left; struct atmlec_ioc ioc_data; lockdep_assert_held(&lec_mutex); /* Lecd must be up in this case */ bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc)); if (bytes_left != 0) pr_info("copy from user failed for %d bytes\n", bytes_left); if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF) return -EINVAL; ioc_data.dev_num = array_index_nospec(ioc_data.dev_num, MAX_LEC_ITF); if (!dev_lec[ioc_data.dev_num]) return -EINVAL; vpriv = kmalloc_obj(struct lec_vcc_priv); if (!vpriv) return -ENOMEM; vpriv->xoff = 0; vpriv->old_pop = vcc->pop; vcc->user_back = vpriv; vcc->pop = lec_pop; lec_vcc_added(netdev_priv(dev_lec[ioc_data.dev_num]), &ioc_data, vcc, vcc->push); vcc->proto_data = dev_lec[ioc_data.dev_num]; vcc->push = lec_push; return 0; } static int lec_mcast_attach(struct atm_vcc *vcc, int arg) { lockdep_assert_held(&lec_mutex); if (arg < 0 || arg >= MAX_LEC_ITF) return -EINVAL; arg = array_index_nospec(arg, MAX_LEC_ITF); if (!dev_lec[arg]) return -EINVAL; vcc->proto_data = dev_lec[arg]; return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc); } /* Initialize device. */ static int lecd_attach(struct atm_vcc *vcc, int arg) { int i; struct lec_priv *priv; lockdep_assert_held(&lec_mutex); if (arg < 0) arg = 0; if (arg >= MAX_LEC_ITF) return -EINVAL; i = array_index_nospec(arg, MAX_LEC_ITF); if (!dev_lec[i]) { int size; size = sizeof(struct lec_priv); dev_lec[i] = alloc_etherdev(size); if (!dev_lec[i]) return -ENOMEM; dev_lec[i]->netdev_ops = &lec_netdev_ops; dev_lec[i]->max_mtu = 18190; snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i); if (register_netdev(dev_lec[i])) { free_netdev(dev_lec[i]); dev_lec[i] = NULL; return -EINVAL; } priv = netdev_priv(dev_lec[i]); } else { priv = netdev_priv(dev_lec[i]); if (priv->lecd) return -EADDRINUSE; } lec_arp_init(priv); priv->itfnum = i; /* LANE2 addition */ priv->lecd = vcc; vcc->dev = &lecatm_dev; vcc_insert_socket(sk_atm(vcc)); vcc->proto_data = dev_lec[i]; set_bit(ATM_VF_META, &vcc->flags); set_bit(ATM_VF_READY, &vcc->flags); /* Set default values to these variables */ priv->maximum_unknown_frame_count = 1; priv->max_unknown_frame_time = (1 * HZ); priv->vcc_timeout_period = (1200 * HZ); priv->max_retry_count = 1; priv->aging_time = (300 * HZ); priv->forward_delay_time = (15 * HZ); priv->topology_change = 0; priv->arp_response_time = (1 * HZ); priv->flush_timeout = (4 * HZ); priv->path_switching_delay = (6 * HZ); if (dev_lec[i]->flags & IFF_UP) netif_start_queue(dev_lec[i]); __module_get(THIS_MODULE); return i; } #ifdef CONFIG_PROC_FS static const char *lec_arp_get_status_string(unsigned char status) { static const char *const lec_arp_status_string[] = { "ESI_UNKNOWN ", "ESI_ARP_PENDING ", "ESI_VC_PENDING ", "<Undefined> ", "ESI_FLUSH_PENDING ", "ESI_FORWARD_DIRECT" }; if (status > ESI_FORWARD_DIRECT) status = 3; /* ESI_UNDEFINED */ return lec_arp_status_string[status]; } static void lec_info(struct seq_file *seq, struct lec_arp_table *entry) { seq_printf(seq, "%pM ", entry->mac_addr); seq_printf(seq, "%*phN ", ATM_ESA_LEN, entry->atm_addr); seq_printf(seq, "%s %4.4x", lec_arp_get_status_string(entry->status), entry->flags & 0xffff); if (entry->vcc) seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci); else seq_printf(seq, " "); if (entry->recv_vcc) { seq_printf(seq, " %3d %3d", entry->recv_vcc->vpi, entry->recv_vcc->vci); } seq_putc(seq, '\n'); } struct lec_state { unsigned long flags; struct lec_priv *locked; struct hlist_node *node; struct net_device *dev; int itf; int arp_table; int misc_table; }; static void *lec_tbl_walk(struct lec_state *state, struct hlist_head *tbl, loff_t *l) { struct hlist_node *e = state->node; if (!e) e = tbl->first; if (e == SEQ_START_TOKEN) { e = tbl->first; --*l; } for (; e; e = e->next) { if (--*l < 0) break; } state->node = e; return (*l < 0) ? state : NULL; } static void *lec_arp_walk(struct lec_state *state, loff_t *l, struct lec_priv *priv) { void *v = NULL; int p; for (p = state->arp_table; p < LEC_ARP_TABLE_SIZE; p++) { v = lec_tbl_walk(state, &priv->lec_arp_tables[p], l); if (v) break; } state->arp_table = p; return v; } static void *lec_misc_walk(struct lec_state *state, loff_t *l, struct lec_priv *priv) { struct hlist_head *lec_misc_tables[] = { &priv->lec_arp_empty_ones, &priv->lec_no_forward, &priv->mcast_fwds }; void *v = NULL; int q; for (q = state->misc_table; q < ARRAY_SIZE(lec_misc_tables); q++) { v = lec_tbl_walk(state, lec_misc_tables[q], l); if (v) break; } state->misc_table = q; return v; } static void *lec_priv_walk(struct lec_state *state, loff_t *l, struct lec_priv *priv) { if (!state->locked) { state->locked = priv; spin_lock_irqsave(&priv->lec_arp_lock, state->flags); } if (!lec_arp_walk(state, l, priv) && !lec_misc_walk(state, l, priv)) { spin_unlock_irqrestore(&priv->lec_arp_lock, state->flags); state->locked = NULL; /* Partial state reset for the next time we get called */ state->arp_table = state->misc_table = 0; } return state->locked; } static void *lec_itf_walk(struct lec_state *state, loff_t *l) { struct net_device *dev; void *v; dev = state->dev ? state->dev : dev_lec[state->itf]; v = (dev && netdev_priv(dev)) ? lec_priv_walk(state, l, netdev_priv(dev)) : NULL; if (!v && dev) { /* Partial state reset for the next time we get called */ dev = NULL; } state->dev = dev; return v; } static void *lec_get_idx(struct lec_state *state, loff_t l) { void *v = NULL; for (; state->itf < MAX_LEC_ITF; state->itf++) { v = lec_itf_walk(state, &l); if (v) break; } return v; } static void *lec_seq_start(struct seq_file *seq, loff_t *pos) { struct lec_state *state = seq->private; mutex_lock(&lec_mutex); state->itf = 0; state->dev = NULL; state->locked = NULL; state->arp_table = 0; state->misc_table = 0; state->node = SEQ_START_TOKEN; return *pos ? lec_get_idx(state, *pos) : SEQ_START_TOKEN; } static void lec_seq_stop(struct seq_file *seq, void *v) { struct lec_state *state = seq->private; if (state->dev) { spin_unlock_irqrestore(&state->locked->lec_arp_lock, state->flags); state->dev = NULL; } mutex_unlock(&lec_mutex); } static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct lec_state *state = seq->private; ++*pos; return lec_get_idx(state, 1); } static int lec_seq_show(struct seq_file *seq, void *v) { static const char lec_banner[] = "Itf MAC ATM destination" " Status Flags " "VPI/VCI Recv VPI/VCI\n"; if (v == SEQ_START_TOKEN) seq_puts(seq, lec_banner); else { struct lec_state *state = seq->private; struct net_device *dev = state->dev; struct lec_arp_table *entry = hlist_entry(state->node, struct lec_arp_table, next); seq_printf(seq, "%s ", dev->name); lec_info(seq, entry); } return 0; } static const struct seq_operations lec_seq_ops = { .start = lec_seq_start, .next = lec_seq_next, .stop = lec_seq_stop, .show = lec_seq_show, }; #endif static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct atm_vcc *vcc = ATM_SD(sock); int err = 0; switch (cmd) { case ATMLEC_CTRL: case ATMLEC_MCAST: case ATMLEC_DATA: if (!capable(CAP_NET_ADMIN)) return -EPERM; break; default: return -ENOIOCTLCMD; } mutex_lock(&lec_mutex); switch (cmd) { case ATMLEC_CTRL: err = lecd_attach(vcc, (int)arg); if (err >= 0) sock->state = SS_CONNECTED; break; case ATMLEC_MCAST: err = lec_mcast_attach(vcc, (int)arg); break; case ATMLEC_DATA: err = lec_vcc_attach(vcc, (void __user *)arg); break; } mutex_unlock(&lec_mutex); return err; } static struct atm_ioctl lane_ioctl_ops = { .owner = THIS_MODULE, .ioctl = lane_ioctl, }; static int __init lane_module_init(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *p; p = proc_create_seq_private("lec", 0444, atm_proc_root, &lec_seq_ops, sizeof(struct lec_state), NULL); if (!p) { pr_err("Unable to initialize /proc/net/atm/lec\n"); return -ENOMEM; } #endif register_atm_ioctl(&lane_ioctl_ops); pr_info("lec.c: initialized\n"); return 0; } static void __exit lane_module_cleanup(void) { int i; #ifdef CONFIG_PROC_FS remove_proc_entry("lec", atm_proc_root); #endif deregister_atm_ioctl(&lane_ioctl_ops); for (i = 0; i < MAX_LEC_ITF; i++) { if (dev_lec[i] != NULL) { unregister_netdev(dev_lec[i]); free_netdev(dev_lec[i]); dev_lec[i] = NULL; } } } module_init(lane_module_init); module_exit(lane_module_cleanup); /* * LANE2: 3.1.3, LE_RESOLVE.request * Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs. * If sizeoftlvs == NULL the default TLVs associated with this * lec will be used. * If dst_mac == NULL, targetless LE_ARP will be sent */ static int lane2_resolve(struct net_device *dev, const u8 *dst_mac, int force, u8 **tlvs, u32 *sizeoftlvs) { unsigned long flags; struct lec_priv *priv = netdev_priv(dev); struct lec_arp_table *table; struct sk_buff *skb; int retval; if (force == 0) { spin_lock_irqsave(&priv->lec_arp_lock, flags); table = lec_arp_find(priv, dst_mac); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); if (table == NULL) return -1; *tlvs = kmemdup(table->tlvs, table->sizeoftlvs, GFP_ATOMIC); if (*tlvs == NULL) return -1; *sizeoftlvs = table->sizeoftlvs; return 0; } if (sizeoftlvs == NULL) retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, NULL); else { skb = alloc_skb(*sizeoftlvs, GFP_ATOMIC); if (skb == NULL) return -1; skb->len = *sizeoftlvs; skb_copy_to_linear_data(skb, *tlvs, *sizeoftlvs); retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, skb); } return retval; } /* * LANE2: 3.1.4, LE_ASSOCIATE.request * Associate the *tlvs with the *lan_dst address. * Will overwrite any previous association * Returns 1 for success, 0 for failure (out of memory) * */ static int lane2_associate_req(struct net_device *dev, const u8 *lan_dst, const u8 *tlvs, u32 sizeoftlvs) { int retval; struct sk_buff *skb; struct lec_priv *priv = netdev_priv(dev); if (!ether_addr_equal(lan_dst, dev->dev_addr)) return 0; /* not our mac address */ kfree(priv->tlvs); /* NULL if there was no previous association */ priv->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL); if (priv->tlvs == NULL) return 0; priv->sizeoftlvs = sizeoftlvs; skb = alloc_skb(sizeoftlvs, GFP_ATOMIC); if (skb == NULL) return 0; skb->len = sizeoftlvs; skb_copy_to_linear_data(skb, tlvs, sizeoftlvs); retval = send_to_lecd(priv, l_associate_req, NULL, NULL, skb); if (retval != 0) pr_info("lec.c: lane2_associate_req() failed\n"); /* * If the previous association has changed we must * somehow notify other LANE entities about the change */ return 1; } /* * LANE2: 3.1.5, LE_ASSOCIATE.indication * */ static void lane2_associate_ind(struct net_device *dev, const u8 *mac_addr, const u8 *tlvs, u32 sizeoftlvs) { #if 0 int i = 0; #endif struct lec_priv *priv = netdev_priv(dev); #if 0 /* * Why have the TLVs in LE_ARP entries * since we do not use them? When you * uncomment this code, make sure the * TLVs get freed when entry is killed */ struct lec_arp_table *entry = lec_arp_find(priv, mac_addr); if (entry == NULL) return; /* should not happen */ kfree(entry->tlvs); entry->tlvs = kmemdup(tlvs, sizeoftlvs, GFP_KERNEL); if (entry->tlvs == NULL) return; entry->sizeoftlvs = sizeoftlvs; #endif #if 0 pr_info("\n"); pr_info("dump of tlvs, sizeoftlvs=%d\n", sizeoftlvs); while (i < sizeoftlvs) pr_cont("%02x ", tlvs[i++]); pr_cont("\n"); #endif /* tell MPOA about the TLVs we saw */ if (priv->lane2_ops && priv->lane2_ops->associate_indicator) { priv->lane2_ops->associate_indicator(dev, mac_addr, tlvs, sizeoftlvs); } } /* * Here starts what used to lec_arpc.c * * lec_arpc.c was added here when making * lane client modular. October 1997 */ #include <linux/types.h> #include <linux/timer.h> #include <linux/param.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <net/route.h> #if 0 #define pr_debug(format, args...) /* #define pr_debug printk */ #endif #define DEBUG_ARP_TABLE 0 #define LEC_ARP_REFRESH_INTERVAL (3*HZ) static void lec_arp_check_expire(struct work_struct *work); static void lec_arp_expire_arp(struct timer_list *t); /* * Arp table funcs */ #define HASH(ch) (ch & (LEC_ARP_TABLE_SIZE - 1)) /* * Initialization of arp-cache */ static void lec_arp_init(struct lec_priv *priv) { unsigned short i; for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) INIT_HLIST_HEAD(&priv->lec_arp_tables[i]); INIT_HLIST_HEAD(&priv->lec_arp_empty_ones); INIT_HLIST_HEAD(&priv->lec_no_forward); INIT_HLIST_HEAD(&priv->mcast_fwds); spin_lock_init(&priv->lec_arp_lock); INIT_DELAYED_WORK(&priv->lec_arp_work, lec_arp_check_expire); schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL); } static void lec_arp_clear_vccs(struct lec_arp_table *entry) { if (entry->vcc) { struct atm_vcc *vcc = entry->vcc; struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); struct net_device *dev = (struct net_device *)vcc->proto_data; if (vpriv) { vcc->pop = vpriv->old_pop; if (vpriv->xoff) netif_wake_queue(dev); kfree(vpriv); vcc->user_back = NULL; vcc->push = entry->old_push; vcc_release_async(vcc, -EPIPE); } entry->vcc = NULL; } if (entry->recv_vcc) { struct atm_vcc *vcc = entry->recv_vcc; struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); if (vpriv) { kfree(vpriv); vcc->user_back = NULL; entry->recv_vcc->push = entry->old_recv_push; vcc_release_async(entry->recv_vcc, -EPIPE); } entry->recv_vcc = NULL; } } /* * Insert entry to lec_arp_table * LANE2: Add to the end of the list to satisfy 8.1.13 */ static inline void lec_arp_add(struct lec_priv *priv, struct lec_arp_table *entry) { struct hlist_head *tmp; tmp = &priv->lec_arp_tables[HASH(entry->mac_addr[ETH_ALEN - 1])]; hlist_add_head(&entry->next, tmp); pr_debug("Added entry:%pM\n", entry->mac_addr); } /* * Remove entry from lec_arp_table */ static int lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove) { struct lec_arp_table *entry; int i, remove_vcc = 1; if (!to_remove) return -1; hlist_del(&to_remove->next); timer_delete(&to_remove->timer); /* * If this is the only MAC connected to this VCC, * also tear down the VCC */ if (to_remove->status >= ESI_FLUSH_PENDING) { /* * ESI_FLUSH_PENDING, ESI_FORWARD_DIRECT */ for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (memcmp(to_remove->atm_addr, entry->atm_addr, ATM_ESA_LEN) == 0) { remove_vcc = 0; break; } } } if (remove_vcc) lec_arp_clear_vccs(to_remove); } skb_queue_purge(&to_remove->tx_wait); /* FIXME: good place for this? */ pr_debug("Removed entry:%pM\n", to_remove->mac_addr); return 0; } #if DEBUG_ARP_TABLE static const char *get_status_string(unsigned char st) { switch (st) { case ESI_UNKNOWN: return "ESI_UNKNOWN"; case ESI_ARP_PENDING: return "ESI_ARP_PENDING"; case ESI_VC_PENDING: return "ESI_VC_PENDING"; case ESI_FLUSH_PENDING: return "ESI_FLUSH_PENDING"; case ESI_FORWARD_DIRECT: return "ESI_FORWARD_DIRECT"; } return "<UNKNOWN>"; } static void dump_arp_table(struct lec_priv *priv) { struct lec_arp_table *rulla; char buf[256]; int i, offset; pr_info("Dump %p:\n", priv); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(rulla, &priv->lec_arp_tables[i], next) { offset = 0; offset += sprintf(buf, "%d: %p\n", i, rulla); offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, rulla->vcc ? rulla->vcc->vci : 0, rulla->recv_vcc ? rulla->recv_vcc-> vpi : 0, rulla->recv_vcc ? rulla->recv_vcc-> vci : 0, rulla->last_used, rulla->timestamp, rulla->no_tries); offset += sprintf(buf + offset, "Flags:%x, Packets_flooded:%x, Status: %s ", rulla->flags, rulla->packets_flooded, get_status_string(rulla->status)); pr_info("%s\n", buf); } } if (!hlist_empty(&priv->lec_no_forward)) pr_info("No forward\n"); hlist_for_each_entry(rulla, &priv->lec_no_forward, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, rulla->vcc ? rulla->vcc->vci : 0, rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, rulla->recv_vcc ? rulla->recv_vcc->vci : 0, rulla->last_used, rulla->timestamp, rulla->no_tries); offset += sprintf(buf + offset, "Flags:%x, Packets_flooded:%x, Status: %s ", rulla->flags, rulla->packets_flooded, get_status_string(rulla->status)); pr_info("%s\n", buf); } if (!hlist_empty(&priv->lec_arp_empty_ones)) pr_info("Empty ones\n"); hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, rulla->vcc ? rulla->vcc->vci : 0, rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, rulla->recv_vcc ? rulla->recv_vcc->vci : 0, rulla->last_used, rulla->timestamp, rulla->no_tries); offset += sprintf(buf + offset, "Flags:%x, Packets_flooded:%x, Status: %s ", rulla->flags, rulla->packets_flooded, get_status_string(rulla->status)); pr_info("%s", buf); } if (!hlist_empty(&priv->mcast_fwds)) pr_info("Multicast Forward VCCs\n"); hlist_for_each_entry(rulla, &priv->mcast_fwds, next) { offset = 0; offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr); offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN, rulla->atm_addr); offset += sprintf(buf + offset, "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", rulla->vcc ? rulla->vcc->vpi : 0, rulla->vcc ? rulla->vcc->vci : 0, rulla->recv_vcc ? rulla->recv_vcc->vpi : 0, rulla->recv_vcc ? rulla->recv_vcc->vci : 0, rulla->last_used, rulla->timestamp, rulla->no_tries); offset += sprintf(buf + offset, "Flags:%x, Packets_flooded:%x, Status: %s ", rulla->flags, rulla->packets_flooded, get_status_string(rulla->status)); pr_info("%s\n", buf); } } #else #define dump_arp_table(priv) do { } while (0) #endif /* * Destruction of arp-cache */ static void lec_arp_destroy(struct lec_priv *priv) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry; int i; cancel_delayed_work_sync(&priv->lec_arp_work); /* * Remove all entries */ spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { lec_arp_remove(priv, entry); lec_arp_put(entry); } INIT_HLIST_HEAD(&priv->lec_arp_tables[i]); } hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { timer_delete_sync(&entry->timer); lec_arp_clear_vccs(entry); hlist_del(&entry->next); lec_arp_put(entry); } INIT_HLIST_HEAD(&priv->lec_arp_empty_ones); hlist_for_each_entry_safe(entry, next, &priv->lec_no_forward, next) { timer_delete_sync(&entry->timer); lec_arp_clear_vccs(entry); hlist_del(&entry->next); lec_arp_put(entry); } INIT_HLIST_HEAD(&priv->lec_no_forward); hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) { /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ lec_arp_clear_vccs(entry); hlist_del(&entry->next); lec_arp_put(entry); } INIT_HLIST_HEAD(&priv->mcast_fwds); priv->mcast_vcc = NULL; spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } /* * Find entry by mac_address */ static struct lec_arp_table *lec_arp_find(struct lec_priv *priv, const unsigned char *mac_addr) { struct hlist_head *head; struct lec_arp_table *entry; pr_debug("%pM\n", mac_addr); head = &priv->lec_arp_tables[HASH(mac_addr[ETH_ALEN - 1])]; hlist_for_each_entry(entry, head, next) { if (ether_addr_equal(mac_addr, entry->mac_addr)) return entry; } return NULL; } static struct lec_arp_table *make_entry(struct lec_priv *priv, const unsigned char *mac_addr) { struct lec_arp_table *to_return; to_return = kzalloc_obj(struct lec_arp_table, GFP_ATOMIC); if (!to_return) return NULL; ether_addr_copy(to_return->mac_addr, mac_addr); INIT_HLIST_NODE(&to_return->next); timer_setup(&to_return->timer, lec_arp_expire_arp, 0); to_return->last_used = jiffies; to_return->priv = priv; skb_queue_head_init(&to_return->tx_wait); refcount_set(&to_return->usage, 1); return to_return; } /* Arp sent timer expired */ static void lec_arp_expire_arp(struct timer_list *t) { struct lec_arp_table *entry; entry = timer_container_of(entry, t, timer); pr_debug("\n"); if (entry->status == ESI_ARP_PENDING) { if (entry->no_tries <= entry->priv->max_retry_count) { if (entry->is_rdesc) send_to_lecd(entry->priv, l_rdesc_arp_xmt, entry->mac_addr, NULL, NULL); else send_to_lecd(entry->priv, l_arp_xmt, entry->mac_addr, NULL, NULL); entry->no_tries++; } mod_timer(&entry->timer, jiffies + (1 * HZ)); } } /* Unknown/unused vcc expire, remove associated entry */ static void lec_arp_expire_vcc(struct timer_list *t) { unsigned long flags; struct lec_arp_table *to_remove = timer_container_of(to_remove, t, timer); struct lec_priv *priv = to_remove->priv; timer_delete(&to_remove->timer); pr_debug("%p %p: vpi:%d vci:%d\n", to_remove, priv, to_remove->vcc ? to_remove->recv_vcc->vpi : 0, to_remove->vcc ? to_remove->recv_vcc->vci : 0); spin_lock_irqsave(&priv->lec_arp_lock, flags); hlist_del(&to_remove->next); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); lec_arp_clear_vccs(to_remove); lec_arp_put(to_remove); } static bool __lec_arp_check_expire(struct lec_arp_table *entry, unsigned long now, struct lec_priv *priv) { unsigned long time_to_check; if ((entry->flags) & LEC_REMOTE_FLAG && priv->topology_change) time_to_check = priv->forward_delay_time; else time_to_check = priv->aging_time; pr_debug("About to expire: %lx - %lx > %lx\n", now, entry->last_used, time_to_check); if (time_after(now, entry->last_used + time_to_check) && !(entry->flags & LEC_PERMANENT_FLAG) && !(entry->mac_addr[0] & 0x01)) { /* LANE2: 7.1.20 */ /* Remove entry */ pr_debug("Entry timed out\n"); lec_arp_remove(priv, entry); lec_arp_put(entry); } else { /* Something else */ if ((entry->status == ESI_VC_PENDING || entry->status == ESI_ARP_PENDING) && time_after_eq(now, entry->timestamp + priv->max_unknown_frame_time)) { entry->timestamp = jiffies; entry->packets_flooded = 0; if (entry->status == ESI_VC_PENDING) send_to_lecd(priv, l_svc_setup, entry->mac_addr, entry->atm_addr, NULL); } if (entry->status == ESI_FLUSH_PENDING && time_after_eq(now, entry->timestamp + priv->path_switching_delay)) { lec_arp_hold(entry); return true; } } return false; } /* * Expire entries. * 1. Re-set timer * 2. For each entry, delete entries that have aged past the age limit. * 3. For each entry, depending on the status of the entry, perform * the following maintenance. * a. If status is ESI_VC_PENDING or ESI_ARP_PENDING then if the * tick_count is above the max_unknown_frame_time, clear * the tick_count to zero and clear the packets_flooded counter * to zero. This supports the packet rate limit per address * while flooding unknowns. * b. If the status is ESI_FLUSH_PENDING and the tick_count is greater * than or equal to the path_switching_delay, change the status * to ESI_FORWARD_DIRECT. This causes the flush period to end * regardless of the progress of the flush protocol. */ static void lec_arp_check_expire(struct work_struct *work) { unsigned long flags; struct lec_priv *priv = container_of(work, struct lec_priv, lec_arp_work.work); struct hlist_node *next; struct lec_arp_table *entry; unsigned long now; int i; pr_debug("%p\n", priv); now = jiffies; restart: spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (__lec_arp_check_expire(entry, now, priv)) { struct sk_buff *skb; struct atm_vcc *vcc = entry->vcc; spin_unlock_irqrestore(&priv->lec_arp_lock, flags); while ((skb = skb_dequeue(&entry->tx_wait))) lec_send(vcc, skb); entry->last_used = jiffies; entry->status = ESI_FORWARD_DIRECT; lec_arp_put(entry); goto restart; } } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); schedule_delayed_work(&priv->lec_arp_work, LEC_ARP_REFRESH_INTERVAL); } /* * Try to find vcc where mac_address is attached. * */ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv, const unsigned char *mac_to_find, int is_rdesc, struct lec_arp_table **ret_entry) { unsigned long flags; struct lec_arp_table *entry; struct atm_vcc *found; if (mac_to_find[0] & 0x01) { switch (priv->lane_version) { case 1: return priv->mcast_vcc; case 2: /* LANE2 wants arp for multicast addresses */ if (ether_addr_equal(mac_to_find, bus_mac)) return priv->mcast_vcc; break; default: break; } } spin_lock_irqsave(&priv->lec_arp_lock, flags); entry = lec_arp_find(priv, mac_to_find); if (entry) { if (entry->status == ESI_FORWARD_DIRECT) { /* Connection Ok */ entry->last_used = jiffies; lec_arp_hold(entry); *ret_entry = entry; found = entry->vcc; goto out; } /* * If the LE_ARP cache entry is still pending, reset count to 0 * so another LE_ARP request can be made for this frame. */ if (entry->status == ESI_ARP_PENDING) entry->no_tries = 0; /* * Data direct VC not yet set up, check to see if the unknown * frame count is greater than the limit. If the limit has * not been reached, allow the caller to send packet to * BUS. */ if (entry->status != ESI_FLUSH_PENDING && entry->packets_flooded < priv->maximum_unknown_frame_count) { entry->packets_flooded++; pr_debug("Flooding..\n"); found = priv->mcast_vcc; goto out; } /* * We got here because entry->status == ESI_FLUSH_PENDING * or BUS flood limit was reached for an entry which is * in ESI_ARP_PENDING or ESI_VC_PENDING state. */ lec_arp_hold(entry); *ret_entry = entry; pr_debug("entry->status %d entry->vcc %p\n", entry->status, entry->vcc); found = NULL; } else { /* No matching entry was found */ entry = make_entry(priv, mac_to_find); pr_debug("Making entry\n"); if (!entry) { found = priv->mcast_vcc; goto out; } lec_arp_add(priv, entry); /* We want arp-request(s) to be sent */ entry->packets_flooded = 1; entry->status = ESI_ARP_PENDING; entry->no_tries = 1; entry->last_used = entry->timestamp = jiffies; entry->is_rdesc = is_rdesc; if (entry->is_rdesc) send_to_lecd(priv, l_rdesc_arp_xmt, mac_to_find, NULL, NULL); else send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL); entry->timer.expires = jiffies + (1 * HZ); entry->timer.function = lec_arp_expire_arp; add_timer(&entry->timer); found = priv->mcast_vcc; } out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); return found; } static int lec_addr_delete(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long permanent) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry; int i; pr_debug("\n"); spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) && (permanent || !(entry->flags & LEC_PERMANENT_FLAG))) { lec_arp_remove(priv, entry); lec_arp_put(entry); } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); return 0; } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); return -1; } /* * Notifies: Response to arp_request (atm_addr != NULL) */ static void lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, const unsigned char *atm_addr, unsigned long remoteflag, unsigned int targetless_le_arp) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry, *tmp; int i; pr_debug("%smac:%pM\n", (targetless_le_arp) ? "targetless " : "", mac_addr); spin_lock_irqsave(&priv->lec_arp_lock, flags); entry = lec_arp_find(priv, mac_addr); if (entry == NULL && targetless_le_arp) goto out; /* * LANE2: ignore targetless LE_ARPs for which * we have no entry in the cache. 7.1.30 */ if (!hlist_empty(&priv->lec_arp_empty_ones)) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) { hlist_del(&entry->next); timer_delete(&entry->timer); tmp = lec_arp_find(priv, mac_addr); if (tmp) { timer_delete(&tmp->timer); tmp->status = ESI_FORWARD_DIRECT; memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN); tmp->vcc = entry->vcc; tmp->old_push = entry->old_push; tmp->last_used = jiffies; timer_delete(&entry->timer); lec_arp_put(entry); entry = tmp; } else { entry->status = ESI_FORWARD_DIRECT; ether_addr_copy(entry->mac_addr, mac_addr); entry->last_used = jiffies; lec_arp_add(priv, entry); } if (remoteflag) entry->flags |= LEC_REMOTE_FLAG; else entry->flags &= ~LEC_REMOTE_FLAG; pr_debug("After update\n"); dump_arp_table(priv); goto out; } } } entry = lec_arp_find(priv, mac_addr); if (!entry) { entry = make_entry(priv, mac_addr); if (!entry) goto out; entry->status = ESI_UNKNOWN; lec_arp_add(priv, entry); /* Temporary, changes before end of function */ } memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN); timer_delete(&entry->timer); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(tmp, &priv->lec_arp_tables[i], next) { if (entry != tmp && !memcmp(tmp->atm_addr, atm_addr, ATM_ESA_LEN)) { /* Vcc to this host exists */ if (tmp->status > ESI_VC_PENDING) { /* * ESI_FLUSH_PENDING, * ESI_FORWARD_DIRECT */ entry->vcc = tmp->vcc; entry->old_push = tmp->old_push; } entry->status = tmp->status; break; } } } if (remoteflag) entry->flags |= LEC_REMOTE_FLAG; else entry->flags &= ~LEC_REMOTE_FLAG; if (entry->status == ESI_ARP_PENDING || entry->status == ESI_UNKNOWN) { entry->status = ESI_VC_PENDING; send_to_lecd(priv, l_svc_setup, entry->mac_addr, atm_addr, NULL); } pr_debug("After update2\n"); dump_arp_table(priv); out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } /* * Notifies: Vcc setup ready */ static void lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, struct atm_vcc *vcc, void (*old_push) (struct atm_vcc *vcc, struct sk_buff *skb)) { unsigned long flags; struct lec_arp_table *entry; int i, found_entry = 0; spin_lock_irqsave(&priv->lec_arp_lock, flags); /* Vcc for Multicast Forward. No timer, LANEv2 7.1.20 and 2.3.5.3 */ if (ioc_data->receive == 2) { pr_debug("LEC_ARP: Attaching mcast forward\n"); #if 0 entry = lec_arp_find(priv, bus_mac); if (!entry) { pr_info("LEC_ARP: Multicast entry not found!\n"); goto out; } memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); entry->recv_vcc = vcc; entry->old_recv_push = old_push; #endif entry = make_entry(priv, bus_mac); if (entry == NULL) goto out; timer_delete(&entry->timer); memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); entry->recv_vcc = vcc; entry->old_recv_push = old_push; hlist_add_head(&entry->next, &priv->mcast_fwds); goto out; } else if (ioc_data->receive == 1) { /* * Vcc which we don't want to make default vcc, * attach it anyway. */ pr_debug("LEC_ARP:Attaching data direct, not default: %*phN\n", ATM_ESA_LEN, ioc_data->atm_addr); entry = make_entry(priv, bus_mac); if (entry == NULL) goto out; memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); eth_zero_addr(entry->mac_addr); entry->recv_vcc = vcc; entry->old_recv_push = old_push; entry->status = ESI_UNKNOWN; entry->timer.expires = jiffies + priv->vcc_timeout_period; entry->timer.function = lec_arp_expire_vcc; hlist_add_head(&entry->next, &priv->lec_no_forward); add_timer(&entry->timer); dump_arp_table(priv); goto out; } pr_debug("LEC_ARP:Attaching data direct, default: %*phN\n", ATM_ESA_LEN, ioc_data->atm_addr); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (memcmp (ioc_data->atm_addr, entry->atm_addr, ATM_ESA_LEN) == 0) { pr_debug("LEC_ARP: Attaching data direct\n"); pr_debug("Currently -> Vcc: %d, Rvcc:%d\n", entry->vcc ? entry->vcc->vci : 0, entry->recv_vcc ? entry->recv_vcc-> vci : 0); found_entry = 1; timer_delete(&entry->timer); entry->vcc = vcc; entry->old_push = old_push; if (entry->status == ESI_VC_PENDING) { if (priv->maximum_unknown_frame_count == 0) entry->status = ESI_FORWARD_DIRECT; else { entry->timestamp = jiffies; entry->status = ESI_FLUSH_PENDING; #if 0 send_to_lecd(priv, l_flush_xmt, NULL, entry->atm_addr, NULL); #endif } } else { /* * They were forming a connection * to us, and we to them. Our * ATM address is numerically lower * than theirs, so we make connection * we formed into default VCC (8.1.11). * Connection they made gets torn * down. This might confuse some * clients. Can be changed if * someone reports trouble... */ ; } } } } if (found_entry) { pr_debug("After vcc was added\n"); dump_arp_table(priv); goto out; } /* * Not found, snatch address from first data packet that arrives * from this vcc */ entry = make_entry(priv, bus_mac); if (!entry) goto out; entry->vcc = vcc; entry->old_push = old_push; memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); eth_zero_addr(entry->mac_addr); entry->status = ESI_UNKNOWN; hlist_add_head(&entry->next, &priv->lec_arp_empty_ones); entry->timer.expires = jiffies + priv->vcc_timeout_period; entry->timer.function = lec_arp_expire_vcc; add_timer(&entry->timer); pr_debug("After vcc was added\n"); dump_arp_table(priv); out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id) { unsigned long flags; struct lec_arp_table *entry; int i; pr_debug("%lx\n", tran_id); restart: spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (entry->flush_tran_id == tran_id && entry->status == ESI_FLUSH_PENDING) { struct sk_buff *skb; struct atm_vcc *vcc = entry->vcc; lec_arp_hold(entry); spin_unlock_irqrestore(&priv->lec_arp_lock, flags); while ((skb = skb_dequeue(&entry->tx_wait))) lec_send(vcc, skb); entry->last_used = jiffies; entry->status = ESI_FORWARD_DIRECT; lec_arp_put(entry); pr_debug("LEC_ARP: Flushed\n"); goto restart; } } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); dump_arp_table(priv); } static void lec_set_flush_tran_id(struct lec_priv *priv, const unsigned char *atm_addr, unsigned long tran_id) { unsigned long flags; struct lec_arp_table *entry; int i; spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) hlist_for_each_entry(entry, &priv->lec_arp_tables[i], next) { if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN)) { entry->flush_tran_id = tran_id; pr_debug("Set flush transaction id to %lx for %p\n", tran_id, entry); } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc) { unsigned long flags; unsigned char mac_addr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; struct lec_arp_table *to_add; struct lec_vcc_priv *vpriv; int err = 0; vpriv = kmalloc_obj(struct lec_vcc_priv); if (!vpriv) return -ENOMEM; vpriv->xoff = 0; vpriv->old_pop = vcc->pop; vcc->user_back = vpriv; vcc->pop = lec_pop; spin_lock_irqsave(&priv->lec_arp_lock, flags); to_add = make_entry(priv, mac_addr); if (!to_add) { vcc->pop = vpriv->old_pop; kfree(vpriv); err = -ENOMEM; goto out; } memcpy(to_add->atm_addr, vcc->remote.sas_addr.prv, ATM_ESA_LEN); to_add->status = ESI_FORWARD_DIRECT; to_add->flags |= LEC_PERMANENT_FLAG; to_add->vcc = vcc; to_add->old_push = vcc->push; vcc->push = lec_push; priv->mcast_vcc = vcc; lec_arp_add(priv, to_add); out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); return err; } static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry; int i; pr_debug("LEC_ARP: lec_vcc_close vpi:%d vci:%d\n", vcc->vpi, vcc->vci); dump_arp_table(priv); spin_lock_irqsave(&priv->lec_arp_lock, flags); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry_safe(entry, next, &priv->lec_arp_tables[i], next) { if (vcc == entry->vcc) { lec_arp_remove(priv, entry); lec_arp_put(entry); if (priv->mcast_vcc == vcc) priv->mcast_vcc = NULL; } } } hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (entry->vcc == vcc) { lec_arp_clear_vccs(entry); timer_delete(&entry->timer); hlist_del(&entry->next); lec_arp_put(entry); } } hlist_for_each_entry_safe(entry, next, &priv->lec_no_forward, next) { if (entry->recv_vcc == vcc) { lec_arp_clear_vccs(entry); timer_delete(&entry->timer); hlist_del(&entry->next); lec_arp_put(entry); } } hlist_for_each_entry_safe(entry, next, &priv->mcast_fwds, next) { if (entry->recv_vcc == vcc) { lec_arp_clear_vccs(entry); /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ hlist_del(&entry->next); lec_arp_put(entry); } } spin_unlock_irqrestore(&priv->lec_arp_lock, flags); dump_arp_table(priv); } static void lec_arp_check_empties(struct lec_priv *priv, struct atm_vcc *vcc, struct sk_buff *skb) { unsigned long flags; struct hlist_node *next; struct lec_arp_table *entry, *tmp; struct lecdatahdr_8023 *hdr = (struct lecdatahdr_8023 *)skb->data; unsigned char *src = hdr->h_source; spin_lock_irqsave(&priv->lec_arp_lock, flags); hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (vcc == entry->vcc) { timer_delete(&entry->timer); ether_addr_copy(entry->mac_addr, src); entry->status = ESI_FORWARD_DIRECT; entry->last_used = jiffies; /* We might have got an entry */ tmp = lec_arp_find(priv, src); if (tmp) { lec_arp_remove(priv, tmp); lec_arp_put(tmp); } hlist_del(&entry->next); lec_arp_add(priv, entry); goto out; } } pr_debug("LEC_ARP: Arp_check_empties: entry not found!\n"); out: spin_unlock_irqrestore(&priv->lec_arp_lock, flags); } MODULE_DESCRIPTION("ATM LAN Emulation (LANE) support"); MODULE_LICENSE("GPL");
3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/device.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/math64.h> #include <linux/ktime.h> #include <linux/ceph/libceph.h> #include <linux/ceph/mon_client.h> #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> #include "super.h" #ifdef CONFIG_DEBUG_FS #include "mds_client.h" #include "metric.h" static int mdsmap_show(struct seq_file *s, void *p) { int i; struct ceph_fs_client *fsc = s->private; struct ceph_mdsmap *mdsmap; if (!fsc->mdsc || !fsc->mdsc->mdsmap) return 0; mdsmap = fsc->mdsc->mdsmap; seq_printf(s, "epoch %d\n", mdsmap->m_epoch); seq_printf(s, "root %d\n", mdsmap->m_root); seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); for (i = 0; i < mdsmap->possible_max_rank; i++) { struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, ceph_pr_addr(addr), ceph_mds_state_name(state)); } return 0; } /* * mdsc debugfs */ static int mdsc_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct rb_node *rp; char *path; mutex_lock(&mdsc->mutex); for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { req = rb_entry(rp, struct ceph_mds_request, r_node); if (req->r_request && req->r_session) seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_session->s_mds); else if (!req->r_request) seq_printf(s, "%lld\t(no request)\t", req->r_tid); else seq_printf(s, "%lld\t(no session)\t", req->r_tid); seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) seq_puts(s, "\t(unsafe)"); else seq_puts(s, "\t"); if (req->r_inode) { seq_printf(s, " #%llx", ceph_ino(req->r_inode)); } else if (req->r_dentry) { struct ceph_path_info path_info = {0}; path = ceph_mdsc_build_path(mdsc, req->r_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_dentry->d_lock); seq_printf(s, " #%llx/%pd (%s)", ceph_ino(d_inode(req->r_dentry->d_parent)), req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); ceph_mdsc_free_path_info(&path_info); } else if (req->r_path1) { seq_printf(s, " #%llx/%s", req->r_ino1.ino, req->r_path1); } else { seq_printf(s, " #%llx", req->r_ino1.ino); } if (req->r_old_dentry) { struct ceph_path_info path_info = {0}; path = ceph_mdsc_build_path(mdsc, req->r_old_dentry, &path_info, 0); if (IS_ERR(path)) path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%pd (%s)", req->r_old_dentry_dir ? ceph_ino(req->r_old_dentry_dir) : 0, req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); ceph_mdsc_free_path_info(&path_info); } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, req->r_path2); else seq_printf(s, " %s", req->r_path2); } seq_puts(s, "\n"); } mutex_unlock(&mdsc->mutex); return 0; } #define CEPH_LAT_METRIC_SHOW(name, total, avg, min, max, sq) { \ s64 _total, _avg, _min, _max, _sq, _st; \ _avg = ktime_to_us(avg); \ _min = ktime_to_us(min == KTIME_MAX ? 0 : min); \ _max = ktime_to_us(max); \ _total = total - 1; \ _sq = _total > 0 ? DIV64_U64_ROUND_CLOSEST(sq, _total) : 0; \ _st = int_sqrt64(_sq); \ _st = ktime_to_us(_st); \ seq_printf(s, "%-14s%-12lld%-16lld%-16lld%-16lld%lld\n", \ name, total, _avg, _min, _max, _st); \ } #define CEPH_SZ_METRIC_SHOW(name, total, avg, min, max, sum) { \ u64 _min = min == U64_MAX ? 0 : min; \ seq_printf(s, "%-14s%-12lld%-16llu%-16llu%-16llu%llu\n", \ name, total, avg, _min, max, sum); \ } static int metrics_file_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_client_metric *m = &fsc->mdsc->metric; seq_printf(s, "item total\n"); seq_printf(s, "------------------------------------------\n"); seq_printf(s, "%-35s%lld\n", "total inodes", percpu_counter_sum(&m->total_inodes)); seq_printf(s, "%-35s%lld\n", "opened files", atomic64_read(&m->opened_files)); seq_printf(s, "%-35s%lld\n", "pinned i_caps", atomic64_read(&m->total_caps)); seq_printf(s, "%-35s%lld\n", "opened inodes", percpu_counter_sum(&m->opened_inodes)); return 0; } static const char * const metric_str[] = { "read", "write", "metadata", "copyfrom" }; static int metrics_latency_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_client_metric *cm = &fsc->mdsc->metric; struct ceph_metric *m; s64 total, avg, min, max, sq; int i; seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); seq_printf(s, "-----------------------------------------------------------------------------------\n"); for (i = 0; i < METRIC_MAX; i++) { m = &cm->metric[i]; spin_lock(&m->lock); total = m->total; avg = m->latency_avg; min = m->latency_min; max = m->latency_max; sq = m->latency_sq_sum; spin_unlock(&m->lock); CEPH_LAT_METRIC_SHOW(metric_str[i], total, avg, min, max, sq); } return 0; } static int metrics_size_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_client_metric *cm = &fsc->mdsc->metric; struct ceph_metric *m; s64 total; u64 sum, avg, min, max; int i; seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n"); seq_printf(s, "----------------------------------------------------------------------------------------\n"); for (i = 0; i < METRIC_MAX; i++) { /* skip 'metadata' as it doesn't use the size metric */ if (i == METRIC_METADATA) continue; m = &cm->metric[i]; spin_lock(&m->lock); total = m->total; sum = m->size_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->size_min; max = m->size_max; spin_unlock(&m->lock); CEPH_SZ_METRIC_SHOW(metric_str[i], total, avg, min, max, sum); } return 0; } static int metrics_caps_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_client_metric *m = &fsc->mdsc->metric; int nr_caps = 0; seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); seq_printf(s, "%-14s%-16lld%-16lld%lld\n", "d_lease", atomic64_read(&m->total_dentries), percpu_counter_sum(&m->d_lease_mis), percpu_counter_sum(&m->d_lease_hit)); nr_caps = atomic64_read(&m->total_caps); seq_printf(s, "%-14s%-16d%-16lld%lld\n", "caps", nr_caps, percpu_counter_sum(&m->i_caps_mis), percpu_counter_sum(&m->i_caps_hit)); return 0; } static int caps_show_cb(struct inode *inode, int mds, void *p) { struct ceph_inode_info *ci = ceph_inode(inode); struct seq_file *s = p; struct ceph_cap *cap; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (cap) seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), cap->session->s_mds, ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented)); spin_unlock(&ci->i_ceph_lock); return 0; } static int caps_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; int total, avail, used, reserved, min, i; struct cap_wait *cw; ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" "avail\t\t%d\n" "used\t\t%d\n" "reserved\t%d\n" "min\t\t%d\n\n", total, avail, used, reserved, min); seq_printf(s, "ino mds issued implemented\n"); seq_printf(s, "--------------------------------------------------\n"); mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { struct ceph_mds_session *session; session = __ceph_lookup_mds_session(mdsc, i); if (!session) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); ceph_iterate_session_caps(session, caps_show_cb, s); mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); seq_printf(s, "\n\nWaiters:\n--------\n"); seq_printf(s, "tgid ino need want\n"); seq_printf(s, "-----------------------------------------------------\n"); spin_lock(&mdsc->caps_list_lock); list_for_each_entry(cw, &mdsc->cap_wait_list, list) { seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino, ceph_cap_string(cw->need), ceph_cap_string(cw->want)); } spin_unlock(&mdsc->caps_list_lock); return 0; } static int mds_sessions_show(struct seq_file *s, void *ptr) { struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_auth_client *ac = fsc->client->monc.auth; struct ceph_options *opt = fsc->client->options; int mds; mutex_lock(&mdsc->mutex); /* The 'num' portion of an 'entity name' */ seq_printf(s, "global_id %llu\n", ac->global_id); /* The -o name mount argument */ seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : ""); /* The list of MDS session rank+state */ for (mds = 0; mds < mdsc->max_sessions; mds++) { struct ceph_mds_session *session = __ceph_lookup_mds_session(mdsc, mds); if (!session) { continue; } mutex_unlock(&mdsc->mutex); seq_printf(s, "mds.%d %s\n", session->s_mds, ceph_session_state_name(session->s_state)); ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); return 0; } static int status_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; struct ceph_entity_inst *inst = &fsc->client->msgr.inst; struct ceph_entity_addr *client_addr = ceph_client_addr(fsc->client); seq_printf(s, "instance: %s.%lld %s/%u\n", ENTITY_NAME(inst->name), ceph_pr_addr(client_addr), le32_to_cpu(client_addr->nonce)); seq_printf(s, "blocklisted: %s\n", str_true_false(fsc->blocklisted)); return 0; } DEFINE_SHOW_ATTRIBUTE(mdsmap); DEFINE_SHOW_ATTRIBUTE(mdsc); DEFINE_SHOW_ATTRIBUTE(caps); DEFINE_SHOW_ATTRIBUTE(mds_sessions); DEFINE_SHOW_ATTRIBUTE(status); DEFINE_SHOW_ATTRIBUTE(metrics_file); DEFINE_SHOW_ATTRIBUTE(metrics_latency); DEFINE_SHOW_ATTRIBUTE(metrics_size); DEFINE_SHOW_ATTRIBUTE(metrics_caps); /* * debugfs */ static int congestion_kb_set(void *data, u64 val) { struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; fsc->mount_options->congestion_kb = (int)val; return 0; } static int congestion_kb_get(void *data, u64 *val) { struct ceph_fs_client *fsc = (struct ceph_fs_client *)data; *val = (u64)fsc->mount_options->congestion_kb; return 0; } DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, congestion_kb_set, "%llu\n"); void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { doutc(fsc->client, "begin\n"); debugfs_remove(fsc->debugfs_bdi); debugfs_remove(fsc->debugfs_congestion_kb); debugfs_remove(fsc->debugfs_mdsmap); debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); debugfs_remove(fsc->debugfs_status); debugfs_remove(fsc->debugfs_mdsc); debugfs_remove_recursive(fsc->debugfs_metrics_dir); doutc(fsc->client, "done\n"); } void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { char name[NAME_MAX]; doutc(fsc->client, "begin\n"); fsc->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 0600, fsc->client->debugfs_dir, fsc, &congestion_kb_fops); snprintf(name, sizeof(name), "../../bdi/%s", bdi_dev_name(fsc->sb->s_bdi)); fsc->debugfs_bdi = debugfs_create_symlink("bdi", fsc->client->debugfs_dir, name); fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 0400, fsc->client->debugfs_dir, fsc, &mdsmap_fops); fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", 0400, fsc->client->debugfs_dir, fsc, &mds_sessions_fops); fsc->debugfs_mdsc = debugfs_create_file("mdsc", 0400, fsc->client->debugfs_dir, fsc, &mdsc_fops); fsc->debugfs_caps = debugfs_create_file("caps", 0400, fsc->client->debugfs_dir, fsc, &caps_fops); fsc->debugfs_status = debugfs_create_file("status", 0400, fsc->client->debugfs_dir, fsc, &status_fops); fsc->debugfs_metrics_dir = debugfs_create_dir("metrics", fsc->client->debugfs_dir); debugfs_create_file("file", 0400, fsc->debugfs_metrics_dir, fsc, &metrics_file_fops); debugfs_create_file("latency", 0400, fsc->debugfs_metrics_dir, fsc, &metrics_latency_fops); debugfs_create_file("size", 0400, fsc->debugfs_metrics_dir, fsc, &metrics_size_fops); debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc, &metrics_caps_fops); doutc(fsc->client, "done\n"); } #else /* CONFIG_DEBUG_FS */ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { } void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) { } #endif /* CONFIG_DEBUG_FS */
4 4 12 11 11 12 11 11 1 25 1 25 1 25 25 25 25 5 22 24 25 2 24 22 6 25 21 9 1 8 8 1 7 8 9 9 8 1 18 21 21 21 21 7 18 20 415 374 44 4 2 4 1 1 3 1 2 1 2 1 2 2 4 19 15 15 7 7 7 17 4 15 17 14 3 3 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 /* * \author Rickard E. (Rik) Faith <faith@valinux.com> * \author Daryll Strauss <daryll@valinux.com> * \author Gareth Hughes <gareth@valinux.com> */ /* * Created: Mon Jan 4 08:58:31 1999 by faith@valinux.com * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/anon_inodes.h> #include <linux/dma-fence.h> #include <linux/export.h> #include <linux/file.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/vga_switcheroo.h> #include <drm/drm_client_event.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_gem.h> #include <drm/drm_print.h> #include <drm/drm_debugfs.h> #include "drm_crtc_internal.h" #include "drm_internal.h" /* from BKL pushdown */ DEFINE_MUTEX(drm_global_mutex); bool drm_dev_needs_global_mutex(struct drm_device *dev) { /* * The deprecated ->load callback must be called after the driver is * already registered. This means such drivers rely on the BKL to make * sure an open can't proceed until the driver is actually fully set up. * Similar hilarity holds for the unload callback. */ if (dev->driver->load || dev->driver->unload) return true; return false; } /** * DOC: file operations * * Drivers must define the file operations structure that forms the DRM * userspace API entry point, even though most of those operations are * implemented in the DRM core. The resulting &struct file_operations must be * stored in the &drm_driver.fops field. The mandatory functions are drm_open(), * drm_read(), drm_ioctl() and drm_compat_ioctl() if CONFIG_COMPAT is enabled * Note that drm_compat_ioctl will be NULL if CONFIG_COMPAT=n, so there's no * need to sprinkle #ifdef into the code. Drivers which implement private ioctls * that require 32/64 bit compatibility support must provide their own * &file_operations.compat_ioctl handler that processes private ioctls and calls * drm_compat_ioctl() for core ioctls. * * In addition drm_read() and drm_poll() provide support for DRM events. DRM * events are a generic and extensible means to send asynchronous events to * userspace through the file descriptor. They are used to send vblank event and * page flip completions by the KMS API. But drivers can also use it for their * own needs, e.g. to signal completion of rendering. * * For the driver-side event interface see drm_event_reserve_init() and * drm_send_event() as the main starting points. * * The memory mapping implementation will vary depending on how the driver * manages memory. For GEM-based drivers this is drm_gem_mmap(). * * No other file operations are supported by the DRM userspace API. Overall the * following is an example &file_operations structure:: * * static const example_drm_fops = { * .owner = THIS_MODULE, * .open = drm_open, * .release = drm_release, * .unlocked_ioctl = drm_ioctl, * .compat_ioctl = drm_compat_ioctl, // NULL if CONFIG_COMPAT=n * .poll = drm_poll, * .read = drm_read, * .mmap = drm_gem_mmap, * }; * * For plain GEM based drivers there is the DEFINE_DRM_GEM_FOPS() macro, and for * DMA based drivers there is the DEFINE_DRM_GEM_DMA_FOPS() macro to make this * simpler. * * The driver's &file_operations must be stored in &drm_driver.fops. * * For driver-private IOCTL handling see the more detailed discussion in * :ref:`IOCTL support in the userland interfaces chapter<drm_driver_ioctl>`. */ /** * drm_file_alloc - allocate file context * @minor: minor to allocate on * * This allocates a new DRM file context. It is not linked into any context and * can be used by the caller freely. Note that the context keeps a pointer to * @minor, so it must be freed before @minor is. * * RETURNS: * Pointer to newly allocated context, ERR_PTR on failure. */ struct drm_file *drm_file_alloc(struct drm_minor *minor) { static atomic64_t ident = ATOMIC64_INIT(0); struct drm_device *dev = minor->dev; struct drm_file *file; int ret; file = kzalloc_obj(*file); if (!file) return ERR_PTR(-ENOMEM); /* Get a unique identifier for fdinfo: */ file->client_id = atomic64_inc_return(&ident); rcu_assign_pointer(file->pid, get_pid(task_tgid(current))); file->minor = minor; /* for compatibility root is always authenticated */ file->authenticated = capable(CAP_SYS_ADMIN); INIT_LIST_HEAD(&file->lhead); INIT_LIST_HEAD(&file->fbs); mutex_init(&file->fbs_lock); INIT_LIST_HEAD(&file->blobs); INIT_LIST_HEAD(&file->pending_event_list); INIT_LIST_HEAD(&file->event_list); init_waitqueue_head(&file->event_wait); file->event_space = 4096; /* set aside 4k for event buffer */ spin_lock_init(&file->master_lookup_lock); mutex_init(&file->event_read_lock); mutex_init(&file->client_name_lock); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_open(dev, file); if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_open(file); drm_prime_init_file_private(&file->prime); if (!drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL)) drm_debugfs_clients_add(file); if (dev->driver->open) { ret = dev->driver->open(dev, file); if (ret < 0) goto out_prime_destroy; } return file; out_prime_destroy: drm_prime_destroy_file_private(&file->prime); if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_release(file); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_release(dev, file); if (!drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL)) drm_debugfs_clients_remove(file); put_pid(rcu_access_pointer(file->pid)); kfree(file); return ERR_PTR(ret); } static void drm_events_release(struct drm_file *file_priv) { struct drm_device *dev = file_priv->minor->dev; struct drm_pending_event *e, *et; unsigned long flags; spin_lock_irqsave(&dev->event_lock, flags); /* Unlink pending events */ list_for_each_entry_safe(e, et, &file_priv->pending_event_list, pending_link) { list_del(&e->pending_link); e->file_priv = NULL; } /* Remove unconsumed events */ list_for_each_entry_safe(e, et, &file_priv->event_list, link) { list_del(&e->link); kfree(e); } spin_unlock_irqrestore(&dev->event_lock, flags); } /** * drm_file_free - free file context * @file: context to free, or NULL * * This destroys and deallocates a DRM file context previously allocated via * drm_file_alloc(). The caller must make sure to unlink it from any contexts * before calling this. * * If NULL is passed, this is a no-op. */ void drm_file_free(struct drm_file *file) { struct drm_device *dev; if (!file) return; dev = file->minor->dev; drm_dbg_core(dev, "comm=\"%s\", pid=%d, dev=0x%lx, open_count=%d\n", current->comm, task_pid_nr(current), (long)old_encode_dev(file->minor->kdev->devt), atomic_read(&dev->open_count)); if (!drm_core_check_feature(dev, DRIVER_COMPUTE_ACCEL)) drm_debugfs_clients_remove(file); drm_events_release(file); if (drm_core_check_feature(dev, DRIVER_MODESET)) { drm_fb_release(file); drm_property_destroy_user_blobs(dev, file); } if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_release(file); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_release(dev, file); if (drm_is_primary_client(file)) drm_master_release(file); if (dev->driver->postclose) dev->driver->postclose(dev, file); drm_prime_destroy_file_private(&file->prime); WARN_ON(!list_empty(&file->event_list)); put_pid(rcu_access_pointer(file->pid)); mutex_destroy(&file->client_name_lock); kfree(file->client_name); kfree(file); } static void drm_close_helper(struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_device *dev = file_priv->minor->dev; mutex_lock(&dev->filelist_mutex); list_del(&file_priv->lhead); mutex_unlock(&dev->filelist_mutex); drm_file_free(file_priv); } /* * Check whether DRI will run on this CPU. * * \return non-zero if the DRI will run on this CPU, or zero otherwise. */ static int drm_cpu_valid(void) { #if defined(__sparc__) && !defined(__sparc_v9__) return 0; /* No cmpxchg before v9 sparc. */ #endif return 1; } /* * Called whenever a process opens a drm node * * \param filp file pointer. * \param minor acquired minor-object. * \return zero on success or a negative number on failure. * * Creates and initializes a drm_file structure for the file private data in \p * filp and add it into the double linked list in \p dev. */ int drm_open_helper(struct file *filp, struct drm_minor *minor) { struct drm_device *dev = minor->dev; struct drm_file *priv; int ret; if (filp->f_flags & O_EXCL) return -EBUSY; /* No exclusive opens */ if (!drm_cpu_valid()) return -EINVAL; if (dev->switch_power_state != DRM_SWITCH_POWER_ON && dev->switch_power_state != DRM_SWITCH_POWER_DYNAMIC_OFF) return -EINVAL; if (WARN_ON_ONCE(!(filp->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) return -EINVAL; drm_dbg_core(dev, "comm=\"%s\", pid=%d, minor=%d\n", current->comm, task_pid_nr(current), minor->index); priv = drm_file_alloc(minor); if (IS_ERR(priv)) return PTR_ERR(priv); if (drm_is_primary_client(priv)) { ret = drm_master_open(priv); if (ret) { drm_file_free(priv); return ret; } } filp->private_data = priv; priv->filp = filp; mutex_lock(&dev->filelist_mutex); list_add(&priv->lhead, &dev->filelist); mutex_unlock(&dev->filelist_mutex); return 0; } /** * drm_open - open method for DRM file * @inode: device inode * @filp: file pointer. * * This function must be used by drivers as their &file_operations.open method. * It looks up the correct DRM device and instantiates all the per-file * resources for it. It also calls the &drm_driver.open driver callback. * * RETURNS: * 0 on success or negative errno value on failure. */ int drm_open(struct inode *inode, struct file *filp) { struct drm_device *dev; struct drm_minor *minor; int retcode; minor = drm_minor_acquire(&drm_minors_xa, iminor(inode)); if (IS_ERR(minor)) return PTR_ERR(minor); dev = minor->dev; if (drm_dev_needs_global_mutex(dev)) mutex_lock(&drm_global_mutex); atomic_fetch_inc(&dev->open_count); /* share address_space across all char-devs of a single device */ filp->f_mapping = dev->anon_inode->i_mapping; retcode = drm_open_helper(filp, minor); if (retcode) goto err_undo; if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); return 0; err_undo: atomic_dec(&dev->open_count); if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); drm_minor_release(minor); return retcode; } EXPORT_SYMBOL(drm_open); static void drm_lastclose(struct drm_device *dev) { drm_client_dev_restore(dev, false); if (dev_is_pci(dev->dev)) vga_switcheroo_process_delayed_switch(); } /** * drm_release - release method for DRM file * @inode: device inode * @filp: file pointer. * * This function must be used by drivers as their &file_operations.release * method. It frees any resources associated with the open file. If this * is the last open file for the DRM device, it also restores the active * in-kernel DRM client. * * RETURNS: * Always succeeds and returns 0. */ int drm_release(struct inode *inode, struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_minor *minor = file_priv->minor; struct drm_device *dev = minor->dev; if (drm_dev_needs_global_mutex(dev)) mutex_lock(&drm_global_mutex); drm_dbg_core(dev, "open_count = %d\n", atomic_read(&dev->open_count)); drm_close_helper(filp); if (atomic_dec_and_test(&dev->open_count)) drm_lastclose(dev); if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); drm_minor_release(minor); return 0; } EXPORT_SYMBOL(drm_release); void drm_file_update_pid(struct drm_file *filp) { struct drm_device *dev; struct pid *pid, *old; /* * Master nodes need to keep the original ownership in order for * drm_master_check_perm to keep working correctly. (See comment in * drm_auth.c.) */ if (filp->was_master) return; pid = task_tgid(current); /* * Quick unlocked check since the model is a single handover followed by * exclusive repeated use. */ if (pid == rcu_access_pointer(filp->pid)) return; dev = filp->minor->dev; mutex_lock(&dev->filelist_mutex); get_pid(pid); old = rcu_replace_pointer(filp->pid, pid, 1); mutex_unlock(&dev->filelist_mutex); synchronize_rcu(); put_pid(old); } /** * drm_release_noglobal - release method for DRM file * @inode: device inode * @filp: file pointer. * * This function may be used by drivers as their &file_operations.release * method. It frees any resources associated with the open file prior to taking * the drm_global_mutex. If this is the last open file for the DRM device, it * then restores the active in-kernel DRM client. * * RETURNS: * Always succeeds and returns 0. */ int drm_release_noglobal(struct inode *inode, struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_minor *minor = file_priv->minor; struct drm_device *dev = minor->dev; drm_close_helper(filp); if (atomic_dec_and_mutex_lock(&dev->open_count, &drm_global_mutex)) { drm_lastclose(dev); mutex_unlock(&drm_global_mutex); } drm_minor_release(minor); return 0; } EXPORT_SYMBOL(drm_release_noglobal); /** * drm_read - read method for DRM file * @filp: file pointer * @buffer: userspace destination pointer for the read * @count: count in bytes to read * @offset: offset to read * * This function must be used by drivers as their &file_operations.read * method if they use DRM events for asynchronous signalling to userspace. * Since events are used by the KMS API for vblank and page flip completion this * means all modern display drivers must use it. * * @offset is ignored, DRM events are read like a pipe. Polling support is * provided by drm_poll(). * * This function will only ever read a full event. Therefore userspace must * supply a big enough buffer to fit any event to ensure forward progress. Since * the maximum event space is currently 4K it's recommended to just use that for * safety. * * RETURNS: * Number of bytes read (always aligned to full events, and can be 0) or a * negative error code on failure. */ ssize_t drm_read(struct file *filp, char __user *buffer, size_t count, loff_t *offset) { struct drm_file *file_priv = filp->private_data; struct drm_device *dev = file_priv->minor->dev; ssize_t ret; ret = mutex_lock_interruptible(&file_priv->event_read_lock); if (ret) return ret; for (;;) { struct drm_pending_event *e = NULL; spin_lock_irq(&dev->event_lock); if (!list_empty(&file_priv->event_list)) { e = list_first_entry(&file_priv->event_list, struct drm_pending_event, link); file_priv->event_space += e->event->length; list_del(&e->link); } spin_unlock_irq(&dev->event_lock); if (e == NULL) { if (ret) break; if (filp->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } mutex_unlock(&file_priv->event_read_lock); ret = wait_event_interruptible(file_priv->event_wait, !list_empty(&file_priv->event_list)); if (ret >= 0) ret = mutex_lock_interruptible(&file_priv->event_read_lock); if (ret) return ret; } else { unsigned length = e->event->length; if (length > count - ret) { put_back_event: spin_lock_irq(&dev->event_lock); file_priv->event_space -= length; list_add(&e->link, &file_priv->event_list); spin_unlock_irq(&dev->event_lock); wake_up_interruptible_poll(&file_priv->event_wait, EPOLLIN | EPOLLRDNORM); break; } if (copy_to_user(buffer + ret, e->event, length)) { if (ret == 0) ret = -EFAULT; goto put_back_event; } ret += length; kfree(e); } } mutex_unlock(&file_priv->event_read_lock); return ret; } EXPORT_SYMBOL(drm_read); /** * drm_poll - poll method for DRM file * @filp: file pointer * @wait: poll waiter table * * This function must be used by drivers as their &file_operations.read method * if they use DRM events for asynchronous signalling to userspace. Since * events are used by the KMS API for vblank and page flip completion this means * all modern display drivers must use it. * * See also drm_read(). * * RETURNS: * Mask of POLL flags indicating the current status of the file. */ __poll_t drm_poll(struct file *filp, struct poll_table_struct *wait) { struct drm_file *file_priv = filp->private_data; __poll_t mask = 0; poll_wait(filp, &file_priv->event_wait, wait); if (!list_empty(&file_priv->event_list)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } EXPORT_SYMBOL(drm_poll); /** * drm_event_reserve_init_locked - init a DRM event and reserve space for it * @dev: DRM device * @file_priv: DRM file private data * @p: tracking structure for the pending event * @e: actual event data to deliver to userspace * * This function prepares the passed in event for eventual delivery. If the event * doesn't get delivered (because the IOCTL fails later on, before queuing up * anything) then the even must be cancelled and freed using * drm_event_cancel_free(). Successfully initialized events should be sent out * using drm_send_event() or drm_send_event_locked() to signal completion of the * asynchronous event to userspace. * * If callers embedded @p into a larger structure it must be allocated with * kmalloc and @p must be the first member element. * * This is the locked version of drm_event_reserve_init() for callers which * already hold &drm_device.event_lock. * * RETURNS: * 0 on success or a negative error code on failure. */ int drm_event_reserve_init_locked(struct drm_device *dev, struct drm_file *file_priv, struct drm_pending_event *p, struct drm_event *e) { if (file_priv->event_space < e->length) return -ENOMEM; file_priv->event_space -= e->length; p->event = e; list_add(&p->pending_link, &file_priv->pending_event_list); p->file_priv = file_priv; return 0; } EXPORT_SYMBOL(drm_event_reserve_init_locked); /** * drm_event_reserve_init - init a DRM event and reserve space for it * @dev: DRM device * @file_priv: DRM file private data * @p: tracking structure for the pending event * @e: actual event data to deliver to userspace * * This function prepares the passed in event for eventual delivery. If the event * doesn't get delivered (because the IOCTL fails later on, before queuing up * anything) then the even must be cancelled and freed using * drm_event_cancel_free(). Successfully initialized events should be sent out * using drm_send_event() or drm_send_event_locked() to signal completion of the * asynchronous event to userspace. * * If callers embedded @p into a larger structure it must be allocated with * kmalloc and @p must be the first member element. * * Callers which already hold &drm_device.event_lock should use * drm_event_reserve_init_locked() instead. * * RETURNS: * 0 on success or a negative error code on failure. */ int drm_event_reserve_init(struct drm_device *dev, struct drm_file *file_priv, struct drm_pending_event *p, struct drm_event *e) { unsigned long flags; int ret; spin_lock_irqsave(&dev->event_lock, flags); ret = drm_event_reserve_init_locked(dev, file_priv, p, e); spin_unlock_irqrestore(&dev->event_lock, flags); return ret; } EXPORT_SYMBOL(drm_event_reserve_init); /** * drm_event_cancel_free - free a DRM event and release its space * @dev: DRM device * @p: tracking structure for the pending event * * This function frees the event @p initialized with drm_event_reserve_init() * and releases any allocated space. It is used to cancel an event when the * nonblocking operation could not be submitted and needed to be aborted. */ void drm_event_cancel_free(struct drm_device *dev, struct drm_pending_event *p) { unsigned long flags; spin_lock_irqsave(&dev->event_lock, flags); if (p->file_priv) { p->file_priv->event_space += p->event->length; list_del(&p->pending_link); } spin_unlock_irqrestore(&dev->event_lock, flags); if (p->fence) dma_fence_put(p->fence); kfree(p); } EXPORT_SYMBOL(drm_event_cancel_free); static void drm_send_event_helper(struct drm_device *dev, struct drm_pending_event *e, ktime_t timestamp) { assert_spin_locked(&dev->event_lock); if (e->completion) { complete_all(e->completion); e->completion_release(e->completion); e->completion = NULL; } if (e->fence) { if (timestamp) dma_fence_signal_timestamp(e->fence, timestamp); else dma_fence_signal(e->fence); dma_fence_put(e->fence); } if (!e->file_priv) { kfree(e); return; } list_del(&e->pending_link); list_add_tail(&e->link, &e->file_priv->event_list); wake_up_interruptible_poll(&e->file_priv->event_wait, EPOLLIN | EPOLLRDNORM); } /** * drm_send_event_timestamp_locked - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * @timestamp: timestamp to set for the fence event in kernel's CLOCK_MONOTONIC * time domain * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. Callers must already hold * &drm_device.event_lock. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event_timestamp_locked(struct drm_device *dev, struct drm_pending_event *e, ktime_t timestamp) { drm_send_event_helper(dev, e, timestamp); } EXPORT_SYMBOL(drm_send_event_timestamp_locked); /** * drm_send_event_locked - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. Callers must already hold * &drm_device.event_lock, see drm_send_event() for the unlocked version. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e) { drm_send_event_helper(dev, e, 0); } EXPORT_SYMBOL(drm_send_event_locked); /** * drm_send_event - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. This function acquires * &drm_device.event_lock, see drm_send_event_locked() for callers which already * hold this lock. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e) { unsigned long irqflags; spin_lock_irqsave(&dev->event_lock, irqflags); drm_send_event_helper(dev, e, 0); spin_unlock_irqrestore(&dev->event_lock, irqflags); } EXPORT_SYMBOL(drm_send_event); void drm_fdinfo_print_size(struct drm_printer *p, const char *prefix, const char *stat, const char *region, u64 sz) { const char *units[] = {"", " KiB", " MiB"}; unsigned u; for (u = 0; u < ARRAY_SIZE(units) - 1; u++) { if (sz == 0 || !IS_ALIGNED(sz, SZ_1K)) break; sz = div_u64(sz, SZ_1K); } drm_printf(p, "%s-%s-%s:\t%llu%s\n", prefix, stat, region, sz, units[u]); } EXPORT_SYMBOL(drm_fdinfo_print_size); int drm_memory_stats_is_zero(const struct drm_memory_stats *stats) { return (stats->shared == 0 && stats->private == 0 && stats->resident == 0 && stats->purgeable == 0 && stats->active == 0); } EXPORT_SYMBOL(drm_memory_stats_is_zero); /** * drm_print_memory_stats - A helper to print memory stats * @p: The printer to print output to * @stats: The collected memory stats * @supported_status: Bitmask of optional stats which are available * @region: The memory region * */ void drm_print_memory_stats(struct drm_printer *p, const struct drm_memory_stats *stats, enum drm_gem_object_status supported_status, const char *region) { const char *prefix = "drm"; drm_fdinfo_print_size(p, prefix, "total", region, stats->private + stats->shared); drm_fdinfo_print_size(p, prefix, "shared", region, stats->shared); if (supported_status & DRM_GEM_OBJECT_ACTIVE) drm_fdinfo_print_size(p, prefix, "active", region, stats->active); if (supported_status & DRM_GEM_OBJECT_RESIDENT) drm_fdinfo_print_size(p, prefix, "resident", region, stats->resident); if (supported_status & DRM_GEM_OBJECT_PURGEABLE) drm_fdinfo_print_size(p, prefix, "purgeable", region, stats->purgeable); } EXPORT_SYMBOL(drm_print_memory_stats); /** * drm_show_memory_stats - Helper to collect and show standard fdinfo memory stats * @p: the printer to print output to * @file: the DRM file * * Helper to iterate over GEM objects with a handle allocated in the specified * file. */ void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file) { struct drm_gem_object *obj; struct drm_memory_stats status = {}; enum drm_gem_object_status supported_status = 0; int id; spin_lock(&file->table_lock); idr_for_each_entry (&file->object_idr, obj, id) { enum drm_gem_object_status s = 0; size_t add_size = (obj->funcs && obj->funcs->rss) ? obj->funcs->rss(obj) : obj->size; if (obj->funcs && obj->funcs->status) { s = obj->funcs->status(obj); supported_status |= s; } if (drm_gem_object_is_shared_for_memory_stats(obj)) status.shared += obj->size; else status.private += obj->size; if (s & DRM_GEM_OBJECT_RESIDENT) { status.resident += add_size; } else { /* If already purged or not yet backed by pages, don't * count it as purgeable: */ s &= ~DRM_GEM_OBJECT_PURGEABLE; } if (!dma_resv_test_signaled(obj->resv, dma_resv_usage_rw(true))) { status.active += add_size; supported_status |= DRM_GEM_OBJECT_ACTIVE; /* If still active, don't count as purgeable: */ s &= ~DRM_GEM_OBJECT_PURGEABLE; } if (s & DRM_GEM_OBJECT_PURGEABLE) status.purgeable += add_size; } spin_unlock(&file->table_lock); drm_print_memory_stats(p, &status, supported_status, "memory"); } EXPORT_SYMBOL(drm_show_memory_stats); /** * drm_show_fdinfo - helper for drm file fops * @m: output stream * @f: the device file instance * * Helper to implement fdinfo, for userspace to query usage stats, etc, of a * process using the GPU. See also &drm_driver.show_fdinfo. * * For text output format description please see Documentation/gpu/drm-usage-stats.rst */ void drm_show_fdinfo(struct seq_file *m, struct file *f) { struct drm_file *file = f->private_data; struct drm_device *dev = file->minor->dev; struct drm_printer p = drm_seq_file_printer(m); int idx; if (!drm_dev_enter(dev, &idx)) return; drm_printf(&p, "drm-driver:\t%s\n", dev->driver->name); drm_printf(&p, "drm-client-id:\t%llu\n", file->client_id); if (dev_is_pci(dev->dev)) { struct pci_dev *pdev = to_pci_dev(dev->dev); drm_printf(&p, "drm-pdev:\t%04x:%02x:%02x.%d\n", pci_domain_nr(pdev->bus), pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); } mutex_lock(&file->client_name_lock); if (file->client_name) drm_printf(&p, "drm-client-name:\t%s\n", file->client_name); mutex_unlock(&file->client_name_lock); if (dev->driver->show_fdinfo) dev->driver->show_fdinfo(&p, file); drm_dev_exit(idx); } EXPORT_SYMBOL(drm_show_fdinfo); /** * drm_file_err - log process name, pid and client_name associated with a drm_file * @file_priv: context of interest for process name and pid * @fmt: printf() like format string * * Helper function for clients which needs to log process details such * as name and pid etc along with user logs. */ void drm_file_err(struct drm_file *file_priv, const char *fmt, ...) { va_list args; struct va_format vaf; struct pid *pid; struct task_struct *task; struct drm_device *dev = file_priv->minor->dev; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; mutex_lock(&file_priv->client_name_lock); rcu_read_lock(); pid = rcu_dereference(file_priv->pid); task = pid_task(pid, PIDTYPE_TGID); drm_err(dev, "comm: %s pid: %d client-id:%llu client: %s ... %pV", task ? task->comm : "Unset", task ? task->pid : 0, file_priv->client_id, file_priv->client_name ?: "Unset", &vaf); va_end(args); rcu_read_unlock(); mutex_unlock(&file_priv->client_name_lock); } EXPORT_SYMBOL(drm_file_err); /** * mock_drm_getfile - Create a new struct file for the drm device * @minor: drm minor to wrap (e.g. #drm_device.primary) * @flags: file creation mode (O_RDWR etc) * * This create a new struct file that wraps a DRM file context around a * DRM minor. This mimicks userspace opening e.g. /dev/dri/card0, but without * invoking userspace. The struct file may be operated on using its f_op * (the drm_device.driver.fops) to mimick userspace operations, or be supplied * to userspace facing functions as an internal/anonymous client. * * RETURNS: * Pointer to newly created struct file, ERR_PTR on failure. */ struct file *mock_drm_getfile(struct drm_minor *minor, unsigned int flags) { struct drm_device *dev = minor->dev; struct drm_file *priv; struct file *file; priv = drm_file_alloc(minor); if (IS_ERR(priv)) return ERR_CAST(priv); file = anon_inode_getfile("drm", dev->driver->fops, priv, flags); if (IS_ERR(file)) { drm_file_free(priv); return file; } /* Everyone shares a single global address space */ file->f_mapping = dev->anon_inode->i_mapping; drm_dev_get(dev); priv->filp = file; return file; } EXPORT_SYMBOL_FOR_TESTS_ONLY(mock_drm_getfile);
134 135 134 7 17 113 112 113 13 111 113 11 34 10 43 4 4 2 2 1 49 48 33 16 52 30 44 52 52 10 51 2 2 32 23 10 22 143 5 5 5 40 130 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 // SPDX-License-Identifier: GPL-2.0 /* * Tty buffer allocation management */ #include <linux/types.h> #include <linux/errno.h> #include <linux/minmax.h> #include <linux/tty.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ratelimit.h> #include "tty.h" #define MIN_TTYB_SIZE 256 #define TTYB_ALIGN_MASK 0xff /* * Byte threshold to limit memory consumption for flip buffers. * The actual memory limit is > 2x this amount. */ #define TTYB_DEFAULT_MEM_LIMIT (640 * 1024UL) /* * We default to dicing tty buffer allocations to this many characters * in order to avoid multiple page allocations. We know the size of * tty_buffer itself but it must also be taken into account that the * buffer is 256 byte aligned. See tty_buffer_find for the allocation * logic this must match. */ #define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~TTYB_ALIGN_MASK) /** * tty_buffer_lock_exclusive - gain exclusive access to buffer * @port: tty port owning the flip buffer * * Guarantees safe use of the &tty_ldisc_ops.receive_buf() method by excluding * the buffer work and any pending flush from using the flip buffer. Data can * continue to be added concurrently to the flip buffer from the driver side. * * See also tty_buffer_unlock_exclusive(). */ void tty_buffer_lock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; atomic_inc(&buf->priority); mutex_lock(&buf->lock); } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer * * The buffer work is restarted if there is data in the flip buffer. * * See also tty_buffer_lock_exclusive(). */ void tty_buffer_unlock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; bool restart = buf->head->commit != buf->head->read; atomic_dec(&buf->priority); mutex_unlock(&buf->lock); if (restart) queue_work(system_dfl_wq, &buf->work); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); /** * tty_buffer_space_avail - return unused buffer space * @port: tty port owning the flip buffer * * Returns: the # of bytes which can be written by the driver without reaching * the buffer limit. * * Note: this does not guarantee that memory is available to write the returned * # of bytes (use tty_prepare_flip_string() to pre-allocate if memory * guarantee is required). */ unsigned int tty_buffer_space_avail(struct tty_port *port) { int space = port->buf.mem_limit - atomic_read(&port->buf.mem_used); return max(space, 0); } EXPORT_SYMBOL_GPL(tty_buffer_space_avail); static void tty_buffer_reset(struct tty_buffer *p, size_t size) { p->used = 0; p->size = size; p->next = NULL; p->commit = 0; p->lookahead = 0; p->read = 0; p->flags = true; } /** * tty_buffer_free_all - free buffers used by a tty * @port: tty port to free from * * Remove all the buffers pending on a tty whether queued with data or in the * free ring. Must be called when the tty is no longer in use. */ void tty_buffer_free_all(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *p, *next; struct llist_node *llist; unsigned int freed = 0; int still_used; while ((p = buf->head) != NULL) { buf->head = p->next; freed += p->size; if (p->size > 0) kfree(p); } llist = llist_del_all(&buf->free); llist_for_each_entry_safe(p, next, llist, free) kfree(p); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; still_used = atomic_xchg(&buf->mem_used, 0); WARN(still_used != freed, "we still have not freed %d bytes!", still_used - freed); } /** * tty_buffer_alloc - allocate a tty buffer * @port: tty port * @size: desired size (characters) * * Allocate a new tty buffer to hold the desired number of characters. We * round our buffers off in 256 character chunks to get better allocation * behaviour. * * Returns: %NULL if out of memory or the allocation would exceed the per * device queue. */ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) { struct llist_node *free; struct tty_buffer *p; /* Round the buffer size out */ size = __ALIGN_MASK(size, TTYB_ALIGN_MASK); if (size <= MIN_TTYB_SIZE) { free = llist_del_first(&port->buf.free); if (free) { p = llist_entry(free, struct tty_buffer, free); goto found; } } /* Should possibly check if this fails for the largest buffer we * have queued and recycle that ? */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; p = kmalloc_flex(*p, data, 2 * size, GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; found: tty_buffer_reset(p, size); atomic_add(size, &port->buf.mem_used); return p; } /** * tty_buffer_free - free a tty buffer * @port: tty port owning the buffer * @b: the buffer to free * * Free a tty buffer, or add it to the free list according to our internal * strategy. */ static void tty_buffer_free(struct tty_port *port, struct tty_buffer *b) { struct tty_bufhead *buf = &port->buf; /* Dumb strategy for now - should keep some stats */ WARN_ON(atomic_sub_return(b->size, &buf->mem_used) < 0); if (b->size > MIN_TTYB_SIZE) kfree(b); else if (b->size > 0) llist_add(&b->free, &buf->free); } /** * tty_buffer_flush - flush full tty buffers * @tty: tty to flush * @ld: optional ldisc ptr (must be referenced) * * Flush all the buffers containing receive data. If @ld != %NULL, flush the * ldisc input buffer. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld) { struct tty_port *port = tty->port; struct tty_bufhead *buf = &port->buf; struct tty_buffer *next; atomic_inc(&buf->priority); mutex_lock(&buf->lock); /* paired w/ release in __tty_buffer_request_room; ensures there are * no pending memory accesses to the freed buffer */ while ((next = smp_load_acquire(&buf->head->next)) != NULL) { tty_buffer_free(port, buf->head); buf->head = next; } buf->head->read = buf->head->commit; buf->head->lookahead = buf->head->read; if (ld && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); atomic_dec(&buf->priority); mutex_unlock(&buf->lock); } /** * __tty_buffer_request_room - grow tty buffer if needed * @port: tty port * @size: size desired * @flags: buffer has to store flags along character data * * Make at least @size bytes of linear space available for the tty buffer. * * Will change over to a new buffer if the current buffer is encoded as * %TTY_NORMAL (so has no flags buffer) and the new buffer requires a flags * buffer. * * Returns: the size we managed to find. */ static int __tty_buffer_request_room(struct tty_port *port, size_t size, bool flags) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *n, *b = buf->tail; size_t left = (b->flags ? 1 : 2) * b->size - b->used; bool change = !b->flags && flags; if (!change && left >= size) return size; /* This is the slow path - looking for new buffers to use */ n = tty_buffer_alloc(port, size); if (n == NULL) return change ? 0 : left; n->flags = flags; buf->tail = n; /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures they see all buffer data. */ smp_store_release(&b->commit, b->used); /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures the latest commit value can be read before the head * is advanced to the next buffer. */ smp_store_release(&b->next, n); return size; } int tty_buffer_request_room(struct tty_port *port, size_t size) { return __tty_buffer_request_room(port, size, true); } EXPORT_SYMBOL_GPL(tty_buffer_request_room); size_t __tty_insert_flip_string_flags(struct tty_port *port, const u8 *chars, const u8 *flags, bool mutable_flags, size_t size) { bool need_flags = mutable_flags || flags[0] != TTY_NORMAL; size_t copied = 0; do { size_t goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE); size_t space = __tty_buffer_request_room(port, goal, need_flags); struct tty_buffer *tb = port->buf.tail; if (unlikely(space == 0)) break; memcpy(char_buf_ptr(tb, tb->used), chars, space); if (mutable_flags) { memcpy(flag_buf_ptr(tb, tb->used), flags, space); flags += space; } else if (tb->flags) { memset(flag_buf_ptr(tb, tb->used), flags[0], space); } else { /* tb->flags should be available once requested */ WARN_ON_ONCE(need_flags); } tb->used += space; copied += space; chars += space; /* There is a small chance that we need to split the data over * several buffers. If this is the case we must loop. */ } while (unlikely(size > copied)); return copied; } EXPORT_SYMBOL(__tty_insert_flip_string_flags); /** * tty_prepare_flip_string - make room for characters * @port: tty port * @chars: return pointer for character write area * @size: desired size * * Prepare a block of space in the buffer for data. * * This is used for drivers that need their own block copy routines into the * buffer. There is no guarantee the buffer is a DMA target! * * Returns: the length available and buffer pointer (@chars) to the space which * is now allocated and accounted for as ready for normal characters. */ size_t tty_prepare_flip_string(struct tty_port *port, u8 **chars, size_t size) { size_t space = __tty_buffer_request_room(port, size, false); if (likely(space)) { struct tty_buffer *tb = port->buf.tail; *chars = char_buf_ptr(tb, tb->used); if (tb->flags) memset(flag_buf_ptr(tb, tb->used), TTY_NORMAL, space); tb->used += space; } return space; } EXPORT_SYMBOL_GPL(tty_prepare_flip_string); /** * tty_ldisc_receive_buf - forward data to line discipline * @ld: line discipline to process input * @p: char buffer * @f: %TTY_NORMAL, %TTY_BREAK, etc. flags buffer * @count: number of bytes to process * * Callers other than flush_to_ldisc() need to exclude the kworker from * concurrent use of the line discipline, see paste_selection(). * * Returns: the number of bytes processed. */ size_t tty_ldisc_receive_buf(struct tty_ldisc *ld, const u8 *p, const u8 *f, size_t count) { if (ld->ops->receive_buf2) count = ld->ops->receive_buf2(ld->tty, p, f, count); else { count = min_t(size_t, count, ld->tty->receive_room); if (count && ld->ops->receive_buf) ld->ops->receive_buf(ld->tty, p, f, count); } return count; } EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf); static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head) { head->lookahead = max(head->lookahead, head->read); while (head) { struct tty_buffer *next; unsigned int count; /* * Paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer. */ next = smp_load_acquire(&head->next); /* * Paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data. */ count = smp_load_acquire(&head->commit) - head->lookahead; if (!count) { head = next; continue; } if (port->client_ops->lookahead_buf) { u8 *p, *f = NULL; p = char_buf_ptr(head, head->lookahead); if (head->flags) f = flag_buf_ptr(head, head->lookahead); port->client_ops->lookahead_buf(port, p, f, count); } head->lookahead += count; } } static size_t receive_buf(struct tty_port *port, struct tty_buffer *head, size_t count) { u8 *p = char_buf_ptr(head, head->read); const u8 *f = NULL; size_t n; if (head->flags) f = flag_buf_ptr(head, head->read); n = port->client_ops->receive_buf(port, p, f, count); if (n > 0) memset(p, 0, n); return n; } /** * flush_to_ldisc - flush data from buffer to ldisc * @work: tty structure passed from work queue. * * This routine is called out of the software interrupt to flush data from the * buffer chain to the line discipline. * * The receive_buf() method is single threaded for each tty instance. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ static void flush_to_ldisc(struct work_struct *work) { struct tty_port *port = container_of(work, struct tty_port, buf.work); struct tty_bufhead *buf = &port->buf; mutex_lock(&buf->lock); while (1) { struct tty_buffer *head = buf->head; struct tty_buffer *next; size_t count, rcvd; /* Ldisc or user is trying to gain exclusive access */ if (atomic_read(&buf->priority)) break; /* paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer */ next = smp_load_acquire(&head->next); /* paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data */ count = smp_load_acquire(&head->commit) - head->read; if (!count) { if (next == NULL) break; buf->head = next; tty_buffer_free(port, head); continue; } rcvd = receive_buf(port, head, count); head->read += rcvd; if (rcvd < count) lookahead_bufs(port, head); if (!rcvd) break; cond_resched(); } mutex_unlock(&buf->lock); } static inline void tty_flip_buffer_commit(struct tty_buffer *tail) { /* * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees * buffer data. */ smp_store_release(&tail->commit, tail->used); } /** * tty_flip_buffer_push - push terminal buffers * @port: tty port to push * * Queue a push of the terminal flip buffers to the line discipline. Can be * called from IRQ/atomic context. * * In the event of the queue being busy for flipping the work will be held off * and retried later. */ void tty_flip_buffer_push(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); queue_work(system_dfl_wq, &buf->work); } EXPORT_SYMBOL(tty_flip_buffer_push); /** * tty_insert_flip_string_and_push_buffer - add characters to the tty buffer and * push * @port: tty port * @chars: characters * @size: size * * The function combines tty_insert_flip_string() and tty_flip_buffer_push() * with the exception of properly holding the @port->lock. * * To be used only internally (by pty currently). * * Returns: the number added. */ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, const u8 *chars, size_t size) { struct tty_bufhead *buf = &port->buf; unsigned long flags; spin_lock_irqsave(&port->lock, flags); size = tty_insert_flip_string(port, chars, size); if (size) tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); queue_work(system_dfl_wq, &buf->work); return size; } /** * tty_buffer_init - prepare a tty buffer structure * @port: tty port to initialise * * Set up the initial state of the buffer management for a tty device. Must be * called before the other tty buffer functions are used. */ void tty_buffer_init(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; mutex_init(&buf->lock); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; init_llist_head(&buf->free); atomic_set(&buf->mem_used, 0); atomic_set(&buf->priority, 0); INIT_WORK(&buf->work, flush_to_ldisc); buf->mem_limit = TTYB_DEFAULT_MEM_LIMIT; } /** * tty_buffer_set_limit - change the tty buffer memory limit * @port: tty port to change * @limit: memory limit to set * * Change the tty buffer memory limit. * * Must be called before the other tty buffer functions are used. */ int tty_buffer_set_limit(struct tty_port *port, int limit) { if (limit < MIN_TTYB_SIZE) return -EINVAL; port->buf.mem_limit = limit; return 0; } EXPORT_SYMBOL_GPL(tty_buffer_set_limit); /* slave ptys can claim nested buffer lock when handling BRK and INTR */ void tty_buffer_set_lock_subclass(struct tty_port *port) { lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE); } bool tty_buffer_restart_work(struct tty_port *port) { return queue_work(system_dfl_wq, &port->buf.work); } bool tty_buffer_cancel_work(struct tty_port *port) { return cancel_work_sync(&port->buf.work); } void tty_buffer_flush_work(struct tty_port *port) { flush_work(&port->buf.work); }
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 // SPDX-License-Identifier: GPL-2.0-only /* * * YeAH TCP * * For further details look at: * https://web.archive.org/web/20080316215752/http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf * */ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <net/tcp.h> #include "tcp_vegas.h" #define TCP_YEAH_ALPHA 80 /* number of packets queued at the bottleneck */ #define TCP_YEAH_GAMMA 1 /* fraction of queue to be removed per rtt */ #define TCP_YEAH_DELTA 3 /* log minimum fraction of cwnd to be removed on loss */ #define TCP_YEAH_EPSILON 1 /* log maximum fraction to be removed on early decongestion */ #define TCP_YEAH_PHY 8 /* maximum delta from base */ #define TCP_YEAH_RHO 16 /* minimum number of consecutive rtt to consider competition on loss */ #define TCP_YEAH_ZETA 50 /* minimum number of state switches to reset reno_count */ #define TCP_SCALABLE_AI_CNT 100U /* YeAH variables */ struct yeah { struct vegas vegas; /* must be first */ /* YeAH */ u32 lastQ; u32 doing_reno_now; u32 reno_count; u32 fast_count; }; static void tcp_yeah_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); tcp_vegas_init(sk); yeah->doing_reno_now = 0; yeah->lastQ = 0; yeah->reno_count = 2; /* Ensure the MD arithmetic works. This is somewhat pedantic, * since I don't think we will see a cwnd this large. :) */ tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) goto do_vegas; } if (!yeah->doing_reno_now) { /* Scalable */ tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT), acked); } else { /* Reno */ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. * * These are so named because they represent the approximate values * of snd_una and snd_nxt at the beginning of the current RTT. More * precisely, they represent the amount of data sent during the RTT. * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, * we will calculate that (v_beg_snd_nxt - v_vegas.beg_snd_una) outstanding * bytes of data have been ACKed during the course of the RTT, giving * an "actual" rate of: * * (v_beg_snd_nxt - v_vegas.beg_snd_una) / (rtt duration) * * Unfortunately, v_vegas.beg_snd_una is not exactly equal to snd_una, * because delayed ACKs can cover more than one segment, so they * don't line up yeahly with the boundaries of RTTs. * * Another unfortunate fact of life is that delayed ACKs delay the * advance of the left edge of our send window, so that the number * of bytes we send in an RTT is often less than our cwnd will allow. * So we keep track of our cwnd separately, in v_beg_snd_cwnd. */ do_vegas: if (after(ack, yeah->vegas.beg_snd_nxt)) { /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got * at least one RTT sample that wasn't from a delayed ACK. * If we only had 2 samples total, * then that means we're getting only 1 ACK per RTT, which * means they're almost certainly delayed ACKs. * If we have 3 samples, we should be OK. */ if (yeah->vegas.cntRTT > 2) { u32 rtt, queue; u64 bw; /* We have enough RTT samples, so, using the Vegas * algorithm, we determine if we should increase or * decrease cwnd, and by how much. */ /* Pluck out the RTT we are using for the Vegas * calculations. This is the min RTT seen during the * last RTT. Taking the min filters out the effects * of delayed ACKs, at the cost of noticing congestion * a bit later. */ rtt = yeah->vegas.minRTT; /* Compute excess number of packets above bandwidth * Avoid doing full 64 bit divide. */ bw = tcp_snd_cwnd(tp); bw *= rtt - yeah->vegas.baseRTT; do_div(bw, rtt); queue = bw; if (queue > TCP_YEAH_ALPHA || rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { if (queue > TCP_YEAH_ALPHA && tcp_snd_cwnd(tp) > yeah->reno_count) { u32 reduction = min(queue / TCP_YEAH_GAMMA , tcp_snd_cwnd(tp) >> TCP_YEAH_EPSILON); tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - reduction); tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), yeah->reno_count)); tp->snd_ssthresh = tcp_snd_cwnd(tp); } if (yeah->reno_count <= 2) yeah->reno_count = max(tcp_snd_cwnd(tp)>>1, 2U); else yeah->reno_count++; yeah->doing_reno_now = min(yeah->doing_reno_now + 1, 0xffffffU); } else { yeah->fast_count++; if (yeah->fast_count > TCP_YEAH_ZETA) { yeah->reno_count = 2; yeah->fast_count = 0; } yeah->doing_reno_now = 0; } yeah->lastQ = queue; } /* Save the extent of the current window so we can use this * at the end of the next RTT. */ yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; yeah->vegas.beg_snd_nxt = tp->snd_nxt; yeah->vegas.beg_snd_cwnd = tcp_snd_cwnd(tp); /* Wipe the slate clean for the next RTT. */ yeah->vegas.cntRTT = 0; yeah->vegas.minRTT = 0x7fffffff; } } static u32 tcp_yeah_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct yeah *yeah = inet_csk_ca(sk); u32 reduction; if (yeah->doing_reno_now < TCP_YEAH_RHO) { reduction = yeah->lastQ; reduction = min(reduction, max(tcp_snd_cwnd(tp)>>1, 2U)); reduction = max(reduction, tcp_snd_cwnd(tp) >> TCP_YEAH_DELTA); } else reduction = max(tcp_snd_cwnd(tp)>>1, 2U); yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); return max_t(int, tcp_snd_cwnd(tp) - reduction, 2); } static struct tcp_congestion_ops tcp_yeah __read_mostly = { .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_yeah_cong_avoid, .set_state = tcp_vegas_state, .cwnd_event = tcp_vegas_cwnd_event, .get_info = tcp_vegas_get_info, .pkts_acked = tcp_vegas_pkts_acked, .owner = THIS_MODULE, .name = "yeah", }; static int __init tcp_yeah_register(void) { BUILD_BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_yeah); return 0; } static void __exit tcp_yeah_unregister(void) { tcp_unregister_congestion_control(&tcp_yeah); } module_init(tcp_yeah_register); module_exit(tcp_yeah_unregister); MODULE_AUTHOR("Angelo P. Castellani"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("YeAH TCP");
2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> struct nft_dup_netdev { u8 sreg_dev; }; static void nft_dup_netdev_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_dup_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; nf_dup_netdev_egress(pkt, oif); } static const struct nla_policy nft_dup_netdev_policy[NFTA_DUP_MAX + 1] = { [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, }; static int nft_dup_netdev_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_dup_netdev *priv = nft_expr_priv(expr); if (tb[NFTA_DUP_SREG_DEV] == NULL) return -EINVAL; return nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV], &priv->sreg_dev, sizeof(int)); } static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_dup_netdev *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_dup_netdev *priv = nft_expr_priv(expr); int oif = ctx->regs[priv->sreg_dev].data.data[0]; return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); } static bool nft_dup_netdev_offload_action(const struct nft_expr *expr) { return true; } static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_netdev)), .eval = nft_dup_netdev_eval, .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, .offload = nft_dup_netdev_offload, .offload_action = nft_dup_netdev_offload_action, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { .family = NFPROTO_NETDEV, .name = "dup", .ops = &nft_dup_netdev_ops, .policy = nft_dup_netdev_policy, .maxattr = NFTA_DUP_MAX, .owner = THIS_MODULE, }; static int __init nft_dup_netdev_module_init(void) { return nft_register_expr(&nft_dup_netdev_type); } static void __exit nft_dup_netdev_module_exit(void) { nft_unregister_expr(&nft_dup_netdev_type); } module_init(nft_dup_netdev_module_init); module_exit(nft_dup_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_AF_EXPR(5, "dup"); MODULE_DESCRIPTION("nftables netdev packet duplication support");
7 4 4 8 2 2 4 8 1 2 5 5 2 4 1 2 3 1 2 21 1 2 1 8 1 1 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/if_vlan.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> /* For layer 4 checksum field offset. */ #include <linux/tcp.h> #include <linux/udp.h> #include <net/gre.h> #include <linux/icmpv6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/sctp/checksum.h> static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, struct vlan_ethhdr *veth) { if (skb_copy_bits(skb, mac_off, veth, ETH_HLEN)) return false; veth->h_vlan_proto = skb->vlan_proto; veth->h_vlan_TCI = htons(skb_vlan_tag_get(skb)); veth->h_vlan_encapsulated_proto = skb->protocol; return true; } /* add vlan header into the user buffer for if tag was removed by offloads */ static bool nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; struct vlan_ethhdr veth; vlanh = (u8 *) &veth; if (offset < VLAN_ETH_HLEN) { u8 ethlen = len; if (!nft_payload_rebuild_vlan_hdr(skb, mac_off, &veth)) return false; if (offset + len > VLAN_ETH_HLEN) ethlen -= offset + len - VLAN_ETH_HLEN; memcpy(dst_u8, vlanh + offset, ethlen); len -= ethlen; if (len == 0) return true; dst_u8 += ethlen; offset = ETH_HLEN; } else { offset -= VLAN_HLEN; } return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; } static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) { unsigned int thoff = nft_thoff(pkt); if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) return -1; switch (pkt->tprot) { case IPPROTO_UDP: pkt->inneroff = thoff + sizeof(struct udphdr); break; case IPPROTO_TCP: { struct tcphdr *th, _tcph; th = skb_header_pointer(pkt->skb, thoff, sizeof(_tcph), &_tcph); if (!th) return -1; pkt->inneroff = thoff + __tcp_hdrlen(th); } break; case IPPROTO_GRE: { u32 offset = sizeof(struct gre_base_hdr); struct gre_base_hdr *gre, _gre; __be16 version; gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre); if (!gre) return -1; version = gre->flags & GRE_VERSION; switch (version) { case GRE_VERSION_0: if (gre->flags & GRE_ROUTING) return -1; if (gre->flags & GRE_CSUM) { offset += sizeof_field(struct gre_full_hdr, csum) + sizeof_field(struct gre_full_hdr, reserved1); } if (gre->flags & GRE_KEY) offset += sizeof_field(struct gre_full_hdr, key); if (gre->flags & GRE_SEQ) offset += sizeof_field(struct gre_full_hdr, seq); break; default: return -1; } pkt->inneroff = thoff + offset; } break; case IPPROTO_IPIP: pkt->inneroff = thoff; break; default: return -1; } pkt->flags |= NFT_PKTINFO_INNER; return 0; } int nft_payload_inner_offset(const struct nft_pktinfo *pkt) { if (!(pkt->flags & NFT_PKTINFO_INNER) && __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0) return -1; return pkt->inneroff; } static bool nft_payload_need_vlan_adjust(u32 offset, u32 len) { unsigned int boundary = offset + len; /* data past ether src/dst requested, copy needed */ if (boundary > offsetof(struct ethhdr, h_proto)) return true; return false; } void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; u32 *dest = &regs->data[priv->dreg]; int offset; if (priv->len % NFT_REG32_SIZE) dest[priv->len / NFT_REG32_SIZE] = 0; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb) || skb_mac_header_len(skb) == 0) goto err; if (skb_vlan_tag_present(skb) && nft_payload_need_vlan_adjust(priv->offset, priv->len)) { if (!nft_payload_copy_vlan(dest, skb, priv->offset, priv->len)) goto err; return; } offset = skb_mac_header(skb) - skb->data; break; case NFT_PAYLOAD_NETWORK_HEADER: offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) goto err; offset = nft_thoff(pkt); break; case NFT_PAYLOAD_INNER_HEADER: offset = nft_payload_inner_offset(pkt); if (offset < 0) goto err; break; default: WARN_ON_ONCE(1); goto err; } offset += priv->offset; if (skb_copy_bits(skb, offset, dest, priv->len) < 0) goto err; return; err: regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 }, [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 }, }; static int nft_payload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_payload *priv = nft_expr_priv(expr); priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, priv->len); } static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_payload *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_PAYLOAD_DREG, priv->dreg) || nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static bool nft_payload_offload_mask(struct nft_offload_reg *reg, u32 priv_len, u32 field_len) { unsigned int remainder, delta, k; struct nft_data mask = {}; __be32 remainder_mask; if (priv_len == field_len) { memset(&reg->mask, 0xff, priv_len); return true; } else if (priv_len > field_len) { return false; } memset(&mask, 0xff, field_len); remainder = priv_len % sizeof(u32); if (remainder) { k = priv_len / sizeof(u32); delta = field_len - priv_len; remainder_mask = htonl(~((1 << (delta * BITS_PER_BYTE)) - 1)); mask.data[k] = (__force u32)remainder_mask; } memcpy(&reg->mask, &mask, field_len); return true; } static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; switch (priv->offset) { case offsetof(struct ethhdr, h_source): if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN)) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, src, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_dest): if (!nft_payload_offload_mask(reg, priv->len, ETH_ALEN)) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_ETH_ADDRS, eth_addrs, dst, ETH_ALEN, reg); break; case offsetof(struct ethhdr, h_proto): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, n_proto, sizeof(__be16), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case offsetof(struct vlan_ethhdr, h_vlan_TCI): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_VLAN, vlan, vlan_tci, sizeof(__be16), reg, NFT_OFFLOAD_F_NETWORK2HOST); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, vlan_tpid, sizeof(__be16), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; case offsetof(struct vlan_ethhdr, h_vlan_TCI) + sizeof(struct vlan_hdr): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_CVLAN, cvlan, vlan_tci, sizeof(__be16), reg, NFT_OFFLOAD_F_NETWORK2HOST); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + sizeof(struct vlan_hdr): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan, vlan_tpid, sizeof(__be16), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; default: return -EOPNOTSUPP; } return 0; } static int nft_payload_offload_ip(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; switch (priv->offset) { case offsetof(struct iphdr, saddr): if (!nft_payload_offload_mask(reg, priv->len, sizeof(struct in_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, src, sizeof(struct in_addr), reg); nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS); break; case offsetof(struct iphdr, daddr): if (!nft_payload_offload_mask(reg, priv->len, sizeof(struct in_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4, dst, sizeof(struct in_addr), reg); nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV4_ADDRS); break; case offsetof(struct iphdr, protocol): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; default: return -EOPNOTSUPP; } return 0; } static int nft_payload_offload_ip6(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; switch (priv->offset) { case offsetof(struct ipv6hdr, saddr): if (!nft_payload_offload_mask(reg, priv->len, sizeof(struct in6_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, src, sizeof(struct in6_addr), reg); nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS); break; case offsetof(struct ipv6hdr, daddr): if (!nft_payload_offload_mask(reg, priv->len, sizeof(struct in6_addr))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6, dst, sizeof(struct in6_addr), reg); nft_flow_rule_set_addr_type(flow, FLOW_DISSECTOR_KEY_IPV6_ADDRS); break; case offsetof(struct ipv6hdr, nexthdr): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__u8))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_BASIC, basic, ip_proto, sizeof(__u8), reg); nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_TRANSPORT); break; default: return -EOPNOTSUPP; } return 0; } static int nft_payload_offload_nh(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) { int err; switch (ctx->dep.l3num) { case htons(ETH_P_IP): err = nft_payload_offload_ip(ctx, flow, priv); break; case htons(ETH_P_IPV6): err = nft_payload_offload_ip6(ctx, flow, priv); break; default: return -EOPNOTSUPP; } return err; } static int nft_payload_offload_tcp(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; switch (priv->offset) { case offsetof(struct tcphdr, source): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct tcphdr, dest): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, sizeof(__be16), reg); break; default: return -EOPNOTSUPP; } return 0; } static int nft_payload_offload_udp(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->dreg]; switch (priv->offset) { case offsetof(struct udphdr, source): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, src, sizeof(__be16), reg); break; case offsetof(struct udphdr, dest): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_PORTS, tp, dst, sizeof(__be16), reg); break; default: return -EOPNOTSUPP; } return 0; } static int nft_payload_offload_th(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_payload *priv) { int err; switch (ctx->dep.protonum) { case IPPROTO_TCP: err = nft_payload_offload_tcp(ctx, flow, priv); break; case IPPROTO_UDP: err = nft_payload_offload_udp(ctx, flow, priv); break; default: return -EOPNOTSUPP; } return err; } static int nft_payload_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_payload *priv = nft_expr_priv(expr); int err; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: err = nft_payload_offload_ll(ctx, flow, priv); break; case NFT_PAYLOAD_NETWORK_HEADER: err = nft_payload_offload_nh(ctx, flow, priv); break; case NFT_PAYLOAD_TRANSPORT_HEADER: err = nft_payload_offload_th(ctx, flow, priv); break; default: err = -EOPNOTSUPP; break; } return err; } static const struct nft_expr_ops nft_payload_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, .offload = nft_payload_offload, }; const struct nft_expr_ops nft_payload_fast_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), .eval = nft_payload_eval, .init = nft_payload_init, .dump = nft_payload_dump, .offload = nft_payload_offload, }; void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; u32 *dest = &regs->data[priv->dreg]; int offset; if (priv->len % NFT_REG32_SIZE) dest[priv->len / NFT_REG32_SIZE] = 0; switch (priv->base) { case NFT_PAYLOAD_TUN_HEADER: if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN)) goto err; offset = tun_ctx->inner_tunoff; break; case NFT_PAYLOAD_LL_HEADER: if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL)) goto err; offset = tun_ctx->inner_lloff; break; case NFT_PAYLOAD_NETWORK_HEADER: if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH)) goto err; offset = tun_ctx->inner_nhoff; break; case NFT_PAYLOAD_TRANSPORT_HEADER: if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) goto err; offset = tun_ctx->inner_thoff; break; default: WARN_ON_ONCE(1); goto err; } offset += priv->offset; if (skb_copy_bits(skb, offset, dest, priv->len) < 0) goto err; return; err: regs->verdict.code = NFT_BREAK; } static int nft_payload_inner_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_payload *priv = nft_expr_priv(expr); u32 base; if (!tb[NFTA_PAYLOAD_BASE] || !tb[NFTA_PAYLOAD_OFFSET] || !tb[NFTA_PAYLOAD_LEN] || !tb[NFTA_PAYLOAD_DREG]) return -EINVAL; base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); switch (base) { case NFT_PAYLOAD_TUN_HEADER: case NFT_PAYLOAD_LL_HEADER: case NFT_PAYLOAD_NETWORK_HEADER: case NFT_PAYLOAD_TRANSPORT_HEADER: break; default: return -EOPNOTSUPP; } priv->base = base; priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, priv->len); } static const struct nft_expr_ops nft_payload_inner_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), .init = nft_payload_inner_init, .dump = nft_payload_dump, /* direct call to nft_payload_inner_eval(). */ }; static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum); if (*sum == 0) *sum = CSUM_MANGLED_0; } static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff) { struct udphdr *uh, _uh; uh = skb_header_pointer(skb, thoff, sizeof(_uh), &_uh); if (!uh) return false; return (__force bool)uh->check; } static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt, struct sk_buff *skb, unsigned int *l4csum_offset) { if (pkt->fragoff) return -1; switch (pkt->tprot) { case IPPROTO_TCP: *l4csum_offset = offsetof(struct tcphdr, check); break; case IPPROTO_UDP: if (!nft_payload_udp_checksum(skb, nft_thoff(pkt))) return -1; fallthrough; case IPPROTO_UDPLITE: *l4csum_offset = offsetof(struct udphdr, check); break; case IPPROTO_ICMPV6: *l4csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); break; default: return -1; } *l4csum_offset += nft_thoff(pkt); return 0; } static int nft_payload_csum_sctp(struct sk_buff *skb, int offset) { struct sctphdr *sh; if (skb_ensure_writable(skb, offset + sizeof(*sh))) return -1; sh = (struct sctphdr *)(skb->data + offset); sh->checksum = sctp_compute_cksum(skb, offset); skb->ip_summed = CHECKSUM_UNNECESSARY; return 0; } static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt, struct sk_buff *skb, __wsum fsum, __wsum tsum) { int l4csum_offset; __sum16 sum; /* If we cannot determine layer 4 checksum offset or this packet doesn't * require layer 4 checksum recalculation, skip this packet. */ if (nft_payload_l4csum_offset(pkt, skb, &l4csum_offset) < 0) return 0; if (skb_copy_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) return -1; /* Checksum mangling for an arbitrary amount of bytes, based on * inet_proto_csum_replace*() functions. */ if (skb->ip_summed != CHECKSUM_PARTIAL) { nft_csum_replace(&sum, fsum, tsum); if (skb->ip_summed == CHECKSUM_COMPLETE) { skb->csum = ~csum_add(csum_sub(~(skb->csum), fsum), tsum); } } else { sum = ~csum_fold(csum_add(csum_sub(csum_unfold(sum), fsum), tsum)); } if (skb_ensure_writable(skb, l4csum_offset + sizeof(sum)) || skb_store_bits(skb, l4csum_offset, &sum, sizeof(sum)) < 0) return -1; return 0; } static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, __wsum fsum, __wsum tsum, int csum_offset) { __sum16 sum; if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) return -1; nft_csum_replace(&sum, fsum, tsum); if (skb_ensure_writable(skb, csum_offset + sizeof(sum)) || skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0) return -1; return 0; } struct nft_payload_set { enum nft_payload_bases base:8; u16 offset; u8 len; u8 sreg; u8 csum_type; u8 csum_offset; u8 csum_flags; }; /* This is not struct vlan_hdr. */ struct nft_payload_vlan_hdr { __be16 h_vlan_proto; __be16 h_vlan_TCI; }; static bool nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len, int *vlan_hlen) { struct nft_payload_vlan_hdr *vlanh; __be16 vlan_proto; u16 vlan_tci; if (offset >= offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto)) { *vlan_hlen = VLAN_HLEN; return true; } switch (offset) { case offsetof(struct vlan_ethhdr, h_vlan_proto): if (len == 2) { vlan_proto = nft_reg_load_be16(src); skb->vlan_proto = vlan_proto; } else if (len == 4) { vlanh = (struct nft_payload_vlan_hdr *)src; __vlan_hwaccel_put_tag(skb, vlanh->h_vlan_proto, ntohs(vlanh->h_vlan_TCI)); } else { return false; } break; case offsetof(struct vlan_ethhdr, h_vlan_TCI): if (len != 2) return false; vlan_tci = ntohs(nft_reg_load_be16(src)); skb->vlan_tci = vlan_tci; break; default: return false; } return true; } static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload_set *priv = nft_expr_priv(expr); const u32 *src = &regs->data[priv->sreg]; int offset, csum_offset, vlan_hlen = 0; struct sk_buff *skb = pkt->skb; __wsum fsum, tsum; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; if (skb_vlan_tag_present(skb) && nft_payload_need_vlan_adjust(priv->offset, priv->len)) { if (!nft_payload_set_vlan(src, skb, priv->offset, priv->len, &vlan_hlen)) goto err; if (!vlan_hlen) return; } offset = skb_mac_header(skb) - skb->data - vlan_hlen; break; case NFT_PAYLOAD_NETWORK_HEADER: offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: if (!(pkt->flags & NFT_PKTINFO_L4PROTO) || pkt->fragoff) goto err; offset = nft_thoff(pkt); break; case NFT_PAYLOAD_INNER_HEADER: offset = nft_payload_inner_offset(pkt); if (offset < 0) goto err; break; default: WARN_ON_ONCE(1); goto err; } csum_offset = offset + priv->csum_offset; offset += priv->offset; if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) && ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER && priv->base != NFT_PAYLOAD_INNER_HEADER) || skb->ip_summed != CHECKSUM_PARTIAL)) { if (offset + priv->len > skb->len) goto err; fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); if (priv->csum_type == NFT_PAYLOAD_CSUM_INET && nft_payload_csum_inet(skb, src, fsum, tsum, csum_offset)) goto err; if (priv->csum_flags && nft_payload_l4csum_update(pkt, skb, fsum, tsum) < 0) goto err; } if (skb_ensure_writable(skb, max(offset + priv->len, 0)) || skb_store_bits(skb, offset, src, priv->len) < 0) goto err; if (priv->csum_type == NFT_PAYLOAD_CSUM_SCTP && pkt->tprot == IPPROTO_SCTP && skb->ip_summed != CHECKSUM_PARTIAL) { if (pkt->fragoff == 0 && nft_payload_csum_sctp(skb, nft_thoff(pkt))) goto err; } return; err: regs->verdict.code = NFT_BREAK; } static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE; struct nft_payload_set *priv = nft_expr_priv(expr); int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); if (err < 0) return err; priv->offset = offset; if (tb[NFTA_PAYLOAD_CSUM_TYPE]) csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { err = nft_parse_u32_check(tb[NFTA_PAYLOAD_CSUM_OFFSET], U8_MAX, &csum_offset); if (err < 0) return err; priv->csum_offset = csum_offset; } if (tb[NFTA_PAYLOAD_CSUM_FLAGS]) { u32 flags; flags = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_FLAGS])); if (flags & ~NFT_PAYLOAD_L4CSUM_PSEUDOHDR) return -EINVAL; priv->csum_flags = flags; } switch (csum_type) { case NFT_PAYLOAD_CSUM_NONE: case NFT_PAYLOAD_CSUM_INET: break; case NFT_PAYLOAD_CSUM_SCTP: if (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER) return -EINVAL; if (priv->csum_offset != offsetof(struct sctphdr, checksum)) return -EINVAL; break; default: return -EOPNOTSUPP; } priv->csum_type = csum_type; return nft_parse_register_load(ctx, tb[NFTA_PAYLOAD_SREG], &priv->sreg, priv->len); } static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_payload_set *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_PAYLOAD_SREG, priv->sreg) || nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) || nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) || nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) || nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) || nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET, htonl(priv->csum_offset)) || nla_put_be32(skb, NFTA_PAYLOAD_CSUM_FLAGS, htonl(priv->csum_flags))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nft_expr_ops nft_payload_set_ops = { .type = &nft_payload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)), .eval = nft_payload_set_eval, .init = nft_payload_set_init, .dump = nft_payload_set_dump, }; static const struct nft_expr_ops * nft_payload_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { enum nft_payload_bases base; unsigned int offset, len; int err; if (tb[NFTA_PAYLOAD_BASE] == NULL || tb[NFTA_PAYLOAD_OFFSET] == NULL || tb[NFTA_PAYLOAD_LEN] == NULL) return ERR_PTR(-EINVAL); base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); switch (base) { case NFT_PAYLOAD_LL_HEADER: case NFT_PAYLOAD_NETWORK_HEADER: case NFT_PAYLOAD_TRANSPORT_HEADER: case NFT_PAYLOAD_INNER_HEADER: break; default: return ERR_PTR(-EOPNOTSUPP); } if (tb[NFTA_PAYLOAD_SREG] != NULL) { if (tb[NFTA_PAYLOAD_DREG] != NULL) return ERR_PTR(-EINVAL); return &nft_payload_set_ops; } if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); if (err < 0) return ERR_PTR(err); err = nft_parse_u32_check(tb[NFTA_PAYLOAD_LEN], U8_MAX, &len); if (err < 0) return ERR_PTR(err); if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && base != NFT_PAYLOAD_LL_HEADER && base != NFT_PAYLOAD_INNER_HEADER) return &nft_payload_fast_ops; else return &nft_payload_ops; } struct nft_expr_type nft_payload_type __read_mostly = { .name = "payload", .select_ops = nft_payload_select_ops, .inner_ops = &nft_payload_inner_ops, .policy = nft_payload_policy, .maxattr = NFTA_PAYLOAD_MAX, .owner = THIS_MODULE, };
34 382 384 382 380 13 383 309 1 311 385 382 317 61 8 378 4 384 382 384 384 384 382 1 1 49 6 384 54 28 355 383 388 38 41 41 39 45 44 7 3 34 34 35 1 37 7 44 350 350 354 349 346 230 229 230 228 49 175 7 1 6 181 229 231 230 229 226 1 185 28 28 28 28 32 28 4 12 12 12 12 12 12 10 7 13 15 16 15 5 14 16 16 16 27 27 27 10 18 18 12 15 3 12 10 11 1 12 10 3 13 16 3 127 9 115 6 158 158 157 157 2 85 22 9 60 21 14 3 5 2 2 3 132 4 151 1 1 153 148 21 131 21 19 136 137 17 119 1 67 82 138 137 2 136 58 4 134 116 21 82 64 30 7 6 5 1 2 26 26 26 26 26 23 2 82 78 3 7 8 8 131 131 2 97 39 40 128 3 1 111 3 31 89 114 141 2 136 138 19 5 103 134 21 108 37 1 52 87 138 108 37 137 139 138 110 9 31 9 61 124 10 77 1 74 21 16 16 54 5 1 5 46 2 3 3 3 3 3 3 1 2 2 2 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 // SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) output module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <Alan.Cox@linux.org> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Hirokazu Takahashi, <taka@valinux.co.jp> * * See ip_input.c for original log * * Fixes: * Alan Cox : Missing nonblock feature in ip_build_xmit. * Mike Kilburn : htons() missing in ip_build_xmit. * Bradford Johnson: Fix faulty handling of some frames when * no route is found. * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit * (in case if packet not accepted by * output firewall rules) * Mike McLagan : Routing by source * Alexey Kuznetsov: use new route cache * Andi Kleen: Fix broken PMTU recovery and remove * some redundant tests. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Andi Kleen : Replace ip_reply with ip_send_reply. * Andi Kleen : Split fast and slow ip_build_xmit path * for decreased register pressure on x86 * and more readability. * Marc Boucher : When call_out_firewall returns FW_QUEUE, * silently drop skb instead of failing with -EPERM. * Detlev Wengorz : Copy protocol for fragments. * Hirokazu Takahashi: HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi: sendfile() on UDP works now. */ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> #include <net/flow.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/xfrm.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/checksum.h> #include <net/gso.h> #include <net/inetpeer.h> #include <net/lwtunnel.h> #include <net/inet_dscp.h> #include <linux/bpf-cgroup.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> #include <linux/netlink.h> #include <linux/tcp.h> #include <net/psp.h> static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, int (*output)(struct net *, struct sock *, struct sk_buff *)); /* Generate a checksum for an outgoing IP datagram. */ void ip_send_check(struct iphdr *iph) { iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } EXPORT_SYMBOL(ip_send_check); int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS); iph_set_totlen(iph, skb->len); ip_send_check(iph); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_out(sk, skb); if (unlikely(!skb)) return 0; skb->protocol = htons(ETH_P_IP); return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = __ip_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } EXPORT_SYMBOL_GPL(ip_local_out); static inline int ip_select_ttl(const struct inet_sock *inet, const struct dst_entry *dst) { int ttl = READ_ONCE(inet->uc_ttl); if (ttl < 0) ttl = ip4_dst_hoplimit(dst); return ttl; } /* * Add an ip header to a skbuff and send it out. * */ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, struct ip_options_rcu *opt, u8 tos) { const struct inet_sock *inet = inet_sk(sk); struct rtable *rt = skb_rtable(skb); struct net *net = sock_net(sk); struct iphdr *iph; /* Build the IP header. */ skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = tos; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; /* Do not bother generating IPID for small packets (eg SYNACK) */ if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; /* TCP packets here are SYNACK with fat IPv4/TCP options. * Avoid using the hashed IP ident generator. */ if (sk->sk_protocol == IPPROTO_TCP) iph->id = (__force __be16)get_random_u16(); else __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; ip_options_build(skb, &opt->opt, daddr, rt); } skb->priority = READ_ONCE(sk->sk_priority); if (!skb->mark) skb->mark = READ_ONCE(sk->sk_mark); /* Send it out. */ return ip_local_out(net, skb->sk, skb); } EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst_dev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); /* OUTOCTETS should be counted after fragment */ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) return -ENOMEM; } if (lwtunnel_xmit_redirect(dst->lwtstate)) { int res = lwtunnel_xmit(skb); if (res != LWTUNNEL_XMIT_CONTINUE) return res; } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int res; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ res = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return res; } rcu_read_unlock(); net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return PTR_ERR(neigh); } static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { struct sk_buff *segs, *nskb; netdev_features_t features; int ret = 0; /* common case: seglen is <= mtu */ if (skb_gso_validate_network_len(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length exceeds the egress MTU. * * This can happen in several cases: * - Forwarding of a TCP GRO skb, when DF flag is not set. * - Forwarding of an skb that arrived on a virtualization interface * (virtio-net/vhost/tap) with TSO/GSO size set by other network * stack. * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an * interface with a smaller MTU. * - Arriving GRO skb (or GSO skb in a virtualized environment) that is * bridged to a NETIF_F_TSO tunnel stacked over an interface with an * insufficient MTU. */ features = netif_skb_features(skb); BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { kfree_skb(skb); return -ENOMEM; } consume_skb(skb); skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); err = ip_fragment(net, sk, segs, mtu, ip_finish_output2); if (err && ret == 0) ret = err; } return ret; } static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned int mtu; #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) /* Policy lookup after SNAT yielded a new policy */ if (skb_dst(skb)->xfrm) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(net, sk, skb); } #endif mtu = ip_skb_dst_mtu(sk, skb); if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); if (skb->len > mtu || IPCB(skb)->frag_max_size) return ip_fragment(net, sk, skb, mtu, ip_finish_output2); return ip_finish_output2(net, sk, skb); } static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { int ret; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_SUCCESS: return __ip_finish_output(net, sk, skb); case NET_XMIT_CN: return __ip_finish_output(net, sk, skb) ? : ret; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } static int ip_mc_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *new_rt; bool do_cn = false; int ret, err; ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); switch (ret) { case NET_XMIT_CN: do_cn = true; fallthrough; case NET_XMIT_SUCCESS: break; default: kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten, * see ipv4_pktinfo_prepare(). */ new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb)); if (new_rt) { new_rt->rt_iif = 0; skb_dst_drop(skb); skb_dst_set(skb, &new_rt->dst); } err = dev_loopback_xmit(net, sk, skb); return (do_cn && err) ? ret : err; } int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = rt->dst.dev; /* * If the indicated interface is up and running, send the packet. */ skb->dev = dev; skb->protocol = htons(ETH_P_IP); /* * Multicasts are looped back for other local users */ if (rt->rt_flags&RTCF_MULTICAST) { if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped by ip_mr_input in any case. Note, that local frames are looped back to be delivered to local recipients. This check is duplicated in ip_mr_input at the moment. */ && ((rt->rt_flags & RTCF_LOCAL) || !(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, ip_mc_finish_output); } /* Multicasts with ttl 0 must not go beyond the host */ if (ip_hdr(skb)->ttl == 0) { kfree_skb(skb); return 0; } } if (rt->rt_flags&RTCF_BROADCAST) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, newskb, NULL, newskb->dev, ip_mc_finish_output); } return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev, *indev = skb->dev; int ret_val; rcu_read_lock(); dev = skb_dst_dev_rcu(skb); skb->dev = dev; skb->protocol = htons(ETH_P_IP); ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, indev, dev, ip_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); rcu_read_unlock(); return ret_val; } EXPORT_SYMBOL(ip_output); /* * copy saddr and daddr, possibly using 64bit load/stores * Equivalent to : * iph->saddr = fl4->saddr; * iph->daddr = fl4->daddr; */ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); iph->saddr = fl4->saddr; iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, __u8 tos) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options_rcu *inet_opt; struct flowi4 *fl4; struct rtable *rt; struct iphdr *iph; int res; /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); fl4 = &fl->u.ip4; rt = skb_rtable(skb); if (rt) goto packet_routed; /* Make sure we can route this packet. */ rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { inet_sk_init_flowi4(inet, fl4); /* sctp_v4_xmit() uses its own DSCP value */ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; sk_setup_caps(sk, &rt->dst); } skb_dst_set_noref(skb, &rt->dst); packet_routed: if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0)); skb_reset_network_header(skb); iph = ip_hdr(skb); *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff)); if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df) iph->frag_off = htons(IP_DF); else iph->frag_off = 0; iph->ttl = ip_select_ttl(inet, &rt->dst); iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); /* Transport layer set skb->h.foo itself. */ if (inet_opt && inet_opt->opt.optlen) { iph->ihl += inet_opt->opt.optlen >> 2; ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt); } ip_select_ident_segs(net, skb, sk, skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); res = ip_local_out(net, sk, skb); rcu_read_unlock(); return res; no_route: rcu_read_unlock(); IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES); return -EHOSTUNREACH; } EXPORT_SYMBOL(__ip_queue_xmit); int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) { return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos)); } EXPORT_SYMBOL(ip_queue_xmit); static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; to->priority = from->priority; to->protocol = from->protocol; to->skb_iif = from->skb_iif; skb_dst_drop(to); skb_dst_copy(to, from); to->dev = from->dev; to->mark = from->mark; skb_copy_hash(to, from); #ifdef CONFIG_NET_SCHED to->tc_index = from->tc_index; #endif nf_copy(to, from); skb_ext_copy(to, from); #if IS_ENABLED(CONFIG_IP_VS) to->ipvs_property = from->ipvs_property; #endif skb_copy_secmark(to, from); } static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph = ip_hdr(skb); if ((iph->frag_off & htons(IP_DF)) == 0) return ip_do_fragment(net, sk, skb, output); if (unlikely(!skb->ignore_df || (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size > mtu))) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); kfree_skb(skb); return -EMSGSIZE; } return ip_do_fragment(net, sk, skb, output); } void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, unsigned int hlen, struct ip_fraglist_iter *iter) { unsigned int first_len = skb_pagelen(skb); iter->frag = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); iter->offset = 0; iter->iph = iph; iter->hlen = hlen; skb->data_len = first_len - skb_headlen(skb); skb->len = first_len; iph->tot_len = htons(first_len); iph->frag_off = htons(IP_MF); ip_send_check(iph); } EXPORT_SYMBOL(ip_fraglist_init); void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) { unsigned int hlen = iter->hlen; struct iphdr *iph = iter->iph; struct sk_buff *frag; frag = iter->frag; frag->ip_summed = CHECKSUM_NONE; skb_reset_transport_header(frag); __skb_push(frag, hlen); skb_reset_network_header(frag); memcpy(skb_network_header(frag), iph, hlen); iter->iph = ip_hdr(frag); iph = iter->iph; iph->tot_len = htons(frag->len); ip_copy_metadata(frag, skb); iter->offset += skb->len - hlen; iph->frag_off = htons(iter->offset >> 3); if (frag->next) iph->frag_off |= htons(IP_MF); /* Ready, complete checksum */ ip_send_check(iph); } EXPORT_SYMBOL(ip_fraglist_prepare); void ip_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int ll_rs, unsigned int mtu, bool DF, struct ip_frag_state *state) { struct iphdr *iph = ip_hdr(skb); state->DF = DF; state->hlen = hlen; state->ll_rs = ll_rs; state->mtu = mtu; state->left = skb->len - hlen; /* Space per frame */ state->ptr = hlen; /* Where to start from */ state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; state->not_last_frag = iph->frag_off & htons(IP_MF); } EXPORT_SYMBOL(ip_frag_init); static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, bool first_frag) { /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; /* ANK: dirty, but effective trick. Upgrade options only if * the segment to be fragmented was THE FIRST (otherwise, * options are already fixed) and make it ONCE * on the initial skb, so that all the following fragments * will inherit fixed options. */ if (first_frag) ip_options_fragment(from); } struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) { unsigned int len = state->left; struct sk_buff *skb2; struct iphdr *iph; /* IF: it doesn't fit, use 'mtu' - the data space left */ if (len > state->mtu) len = state->mtu; /* IF: we are not sending up to and including the packet end then align the next start on an eight byte boundary */ if (len < state->left) { len &= ~7; } /* Allocate buffer */ skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); if (!skb2) return ERR_PTR(-ENOMEM); /* * Set up data on packet */ ip_copy_metadata(skb2, skb); skb_reserve(skb2, state->ll_rs); skb_put(skb2, len + state->hlen); skb_reset_network_header(skb2); skb2->transport_header = skb2->network_header + state->hlen; /* * Charge the memory for the fragment to any owner * it might possess */ if (skb->sk) skb_set_owner_w(skb2, skb->sk); /* * Copy the packet header into the new buffer. */ skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen); /* * Copy a block of the IP datagram. */ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len)) BUG(); state->left -= len; /* * Fill in the new header fields. */ iph = ip_hdr(skb2); iph->frag_off = htons((state->offset >> 3)); if (state->DF) iph->frag_off |= htons(IP_DF); /* * Added AC : If we are fragmenting a fragment that's not the * last fragment then keep MF on each bit */ if (state->left > 0 || state->not_last_frag) iph->frag_off |= htons(IP_MF); state->ptr += len; state->offset += len; iph->tot_len = htons(len + state->hlen); ip_send_check(iph); return skb2; } EXPORT_SYMBOL(ip_frag_next); /* * This IP datagram is too large to be sent in one piece. Break it up into * smaller pieces (each of size equal to IP header plus * a block of the data of the original IP data part) that will yet fit in a * single device frame, and queue such a frame for sending. */ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { struct iphdr *iph; struct sk_buff *skb2; u8 tstamp_type = skb->tstamp_type; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; ktime_t tstamp = skb->tstamp; struct ip_frag_state state; int err = 0; /* for offloaded checksums cleanup checksum before fragmentation */ if (skb->ip_summed == CHECKSUM_PARTIAL && (err = skb_checksum_help(skb))) goto fail; /* * Point into the IP datagram header. */ iph = ip_hdr(skb); mtu = ip_skb_dst_mtu(sk, skb); if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) mtu = IPCB(skb)->frag_max_size; /* * Setup starting values. */ hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing * one, it is not prohibited. In this case fall back to copying. * * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ if (skb_has_frag_list(skb)) { struct sk_buff *frag, *frag2; unsigned int first_len = skb_pagelen(skb); if (first_len - hlen > mtu || ((first_len - hlen) & 7) || ip_is_fragment(iph) || skb_cloned(skb) || skb_headroom(skb) < ll_rs) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || skb_headroom(frag) < hlen + ll_rs) goto slow_path_clean; /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path_clean; BUG_ON(frag->sk); if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; } skb->truesize -= frag->truesize; } /* Everything is OK. Generate! */ ip_fraglist_init(skb, iph, hlen, &iter); for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) { bool first_frag = (iter.offset == 0); IPCB(iter.frag)->flags = IPCB(skb)->flags; ip_fraglist_prepare(skb, &iter); if (first_frag && IPCB(skb)->opt.optlen) { /* ipcb->opt is not populated for frags * coming from __ip_make_skb(), * ip_options_fragment() needs optlen */ IPCB(iter.frag)->opt.optlen = IPCB(skb)->opt.optlen; ip_options_fragment(iter.frag); ip_send_check(iter.iph); } } skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); if (err || !iter.frag) break; skb = ip_fraglist_next(&iter); } if (err == 0) { IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return 0; } kfree_skb_list(iter.frag); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; slow_path_clean: skb_walk_frags(skb, frag2) { if (frag2 == frag) break; frag2->sk = NULL; frag2->destructor = NULL; skb->truesize += frag2->truesize; } } slow_path: /* * Fragment the datagram. */ ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU, &state); /* * Keep copying data until we run out. */ while (state.left > 0) { bool first_frag = (state.offset == 0); skb2 = ip_frag_next(skb, &state); if (IS_ERR(skb2)) { err = PTR_ERR(skb2); goto fail; } ip_frag_ipcb(skb, skb2, first_frag); /* * Put this fragment into the sending queue. */ skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, skb2); if (err) goto fail; IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); } consume_skb(skb); IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); return err; } EXPORT_SYMBOL(ip_do_fragment); int ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { if (!copy_from_iter_full(to, len, &msg->msg_iter)) return -EFAULT; } else { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } return 0; } EXPORT_SYMBOL(ip_generic_getfrag); static int __ip_append_data(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; int mtu; int copy; int err; int offset = 0; bool zc = false; unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = dst_rtable(cork->dst); bool paged, hold_tskey = false, extra_uref = false; unsigned int wmem_alloc_delta = 0; u32 tskey = 0; skb = skb_peek_tail(queue); exthdrlen = !skb ? rt->dst.header_len : 0; mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu; if (cork->length + length > maxnonfragsize - fragheaderlen) { ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu - (opt ? opt->optlen : 0)); return -EMSGSIZE; } /* * transhdrlen > 0 means that this is the first fragment and we wish * it won't be fragmented in the future. */ if (transhdrlen && length + fragheaderlen <= mtu && rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) && (!(flags & MSG_MORE) || cork->gso_size) && (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM))) csummode = CHECKSUM_PARTIAL; if ((flags & MSG_ZEROCOPY) && length) { struct msghdr *msg = from; if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) return -EINVAL; /* Leave uarg NULL if can't zerocopy, callers should * be able to handle it. */ if ((rt->dst.dev->features & NETIF_F_SG) && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ if (rt->dst.dev->features & NETIF_F_SG && csummode == CHECKSUM_PARTIAL) { paged = true; zc = true; } else { uarg_to_msgzc(uarg)->zerocopy = 0; skb_zcopy_set(skb, uarg, &extra_uref); } } } else if ((flags & MSG_SPLICE_PAGES) && length) { if (inet_test_bit(HDRINCL, sk)) return -EPERM; if (rt->dst.dev->features & NETIF_F_SG && getfrag == ip_generic_getfrag) /* We need an empty buffer to attach stuff to */ paged = true; else flags &= ~MSG_SPLICE_PAGES; } cork->length += length; if (cork->tx_flags & SKBTX_ANY_TSTAMP && READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { if (cork->flags & IPCORK_TS_OPT_ID) { tskey = cork->ts_opt_id; } else { tskey = atomic_inc_return(&sk->sk_tskey) - 1; hold_tskey = true; } } /* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, * each of segments is IP fragment ready for sending to network after * adding appropriate IP header. */ if (!skb) goto alloc_new_skb; while (length > 0) { /* Check if the remaining data fits into current packet. */ copy = mtu - skb->len; if (copy < length) copy = maxfraglen - skb->len; if (copy <= 0) { char *data; unsigned int datalen; unsigned int fraglen; unsigned int fraggap; unsigned int alloclen, alloc_extra; unsigned int pagedlen; struct sk_buff *skb_prev; alloc_new_skb: skb_prev = skb; if (skb_prev) fraggap = skb_prev->len - maxfraglen; else fraggap = 0; /* * If remaining data exceeds the mtu, * we know we need more fragment(s). */ datalen = length + fraggap; if (datalen > mtu - fragheaderlen) datalen = maxfraglen - fragheaderlen; fraglen = datalen + fragheaderlen; pagedlen = 0; alloc_extra = hh_len + 15; alloc_extra += exthdrlen; /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ if (datalen == length + fraggap) alloc_extra += rt->dst.trailer_len; if ((flags & MSG_MORE) && !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else if (!paged && (fraglen + alloc_extra < SKB_MAX_ALLOC || !(rt->dst.dev->features & NETIF_F_SG))) alloclen = fraglen; else { alloclen = fragheaderlen + transhdrlen; pagedlen = datalen - transhdrlen; } alloclen += alloc_extra; if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen, (flags & MSG_DONTWAIT), &err); } else { skb = NULL; if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 2 * sk->sk_sndbuf) skb = alloc_skb(alloclen, sk->sk_allocation); if (unlikely(!skb)) err = -ENOBUFS; } if (!skb) goto error; /* * Fill in the control structures */ skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); /* * Find where to start putting bytes. */ data = skb_put(skb, fraglen + exthdrlen - pagedlen); skb_set_network_header(skb, exthdrlen); skb->transport_header = (skb->network_header + fragheaderlen); data += fragheaderlen + exthdrlen; if (fraggap) { skb->csum = skb_copy_and_csum_bits( skb_prev, maxfraglen, data + transhdrlen, fraggap); skb_prev->csum = csum_sub(skb_prev->csum, skb->csum); data += fraggap; pskb_trim_unique(skb_prev, maxfraglen); } copy = datalen - transhdrlen - fraggap - pagedlen; /* [!] NOTE: copy will be negative if pagedlen>0 * because then the equation reduces to -fraggap. */ if (copy > 0 && INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; } else if (flags & MSG_SPLICE_PAGES) { copy = 0; } offset += copy; length -= copy + transhdrlen; transhdrlen = 0; exthdrlen = 0; csummode = CHECKSUM_NONE; /* only the initial fragment is time stamped */ skb_shinfo(skb)->tx_flags = cork->tx_flags; cork->tx_flags = 0; skb_shinfo(skb)->tskey = tskey; tskey = 0; skb_zcopy_set(skb, uarg, &extra_uref); if ((flags & MSG_CONFIRM) && !skb_prev) skb_set_dst_pending_confirm(skb, 1); /* * Put the packet on the pending queue. */ if (!skb->destructor) { skb->destructor = sock_wfree; skb->sk = sk; wmem_alloc_delta += skb->truesize; } __skb_queue_tail(queue, skb); continue; } if (copy > length) copy = length; if (!(rt->dst.dev->features&NETIF_F_SG) && skb_tailroom(skb) >= copy) { unsigned int off; off = skb->len; if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, skb_put(skb, copy), offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; } } else if (flags & MSG_SPLICE_PAGES) { struct msghdr *msg = from; err = -EIO; if (WARN_ON_ONCE(copy > msg->msg_iter.count)) goto error; err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) goto error; copy = err; wmem_alloc_delta += copy; } else if (!zc) { int i = skb_shinfo(skb)->nr_frags; err = -ENOMEM; if (!sk_page_frag_refill(sk, pfrag)) goto error; skb_zcopy_downgrade_managed(skb); if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { err = -EMSGSIZE; if (i == MAX_SKB_FRAGS) goto error; __skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, 0); skb_shinfo(skb)->nr_frags = ++i; get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; pfrag->offset += copy; skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); skb_len_add(skb, copy); wmem_alloc_delta += copy; } else { err = skb_zerocopy_iter_dgram(skb, from, copy); if (err < 0) goto error; } offset += copy; length -= copy; } if (wmem_alloc_delta) refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); return 0; error_efault: err = -EFAULT; error: net_zcopy_put_abort(uarg, extra_uref); cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); if (hold_tskey) atomic_dec(&sk->sk_tskey); return err; } static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, struct ipcm_cookie *ipc, struct rtable **rtp) { struct ip_options_rcu *opt; struct rtable *rt; rt = *rtp; if (unlikely(!rt)) return -EFAULT; cork->fragsize = ip_sk_use_pmtu(sk) ? dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); if (!inetdev_valid_mtu(cork->fragsize)) return -ENETUNREACH; /* * setup for corking. */ opt = ipc->opt; if (opt) { if (!cork->opt) { cork->opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); if (unlikely(!cork->opt)) return -ENOBUFS; } memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen); cork->flags |= IPCORK_OPT; cork->addr = ipc->addr; } cork->gso_size = ipc->gso_size; cork->dst = &rt->dst; /* We stole this route, caller should not release it. */ *rtp = NULL; cork->length = 0; cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->mark = ipc->sockc.mark; cork->priority = ipc->sockc.priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->flags |= IPCORK_TS_OPT_ID; cork->ts_opt_id = ipc->sockc.ts_opt_id; } return 0; } /* * ip_append_data() can make one large IP datagram from many pieces of * data. Each piece will be held on the socket until * ip_push_pending_frames() is called. Each piece can be a page or * non-page data. * * Not only UDP, other transport protocols - e.g. raw sockets - can use * this interface potentially. * * LATER: length must be adjusted by pad at tail, when it is required. */ int ip_append_data(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); int err; if (flags&MSG_PROBE) return 0; if (skb_queue_empty(&sk->sk_write_queue)) { err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp); if (err) return err; } else { transhdrlen = 0; } return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, sk_page_frag(sk), getfrag, from, length, transhdrlen, flags); } static void ip_cork_release(struct inet_cork *cork) { cork->flags &= ~IPCORK_OPT; kfree(cork->opt); cork->opt = NULL; dst_release(cork->dst); cork->dst = NULL; } /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. */ struct sk_buff *__ip_make_skb(struct sock *sk, struct flowi4 *fl4, struct sk_buff_head *queue, struct inet_cork *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; struct rtable *rt = dst_rtable(cork->dst); struct iphdr *iph; u8 pmtudisc, ttl; __be16 df = 0; skb = __skb_dequeue(queue); if (!skb) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); while ((tmp_skb = __skb_dequeue(queue)) != NULL) { __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; skb->truesize += tmp_skb->truesize; tmp_skb->destructor = NULL; tmp_skb->sk = NULL; } /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow * to fragment the frame generated here. No matter, what transforms * how transforms change size of the packet, it will come out. */ skb->ignore_df = ip_sk_ignore_df(sk); /* DF bit is set when we want to see DF on outgoing frames. * If ignore_df is set too, we still allow to fragment this frame * locally. */ pmtudisc = READ_ONCE(inet->pmtudisc); if (pmtudisc == IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_PROBE || (skb->len <= dst4_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); if (cork->flags & IPCORK_OPT) opt = cork->opt; if (cork->ttl != 0) ttl = cork->ttl; else if (rt->rt_type == RTN_MULTICAST) ttl = READ_ONCE(inet->mc_ttl); else ttl = ip_select_ttl(inet, &rt->dst); iph = ip_hdr(skb); iph->version = 4; iph->ihl = 5; iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos); iph->frag_off = df; iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); ip_select_ident(net, skb, sk); if (opt) { iph->ihl += opt->optlen >> 2; ip_options_build(skb, opt, cork->addr, rt); } skb->priority = cork->priority; skb->mark = cork->mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); else skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount */ cork->dst = NULL; skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) { u8 icmp_type; /* For such sockets, transhdrlen is zero when do ip_append_data(), * so icmphdr does not in skb linear region and can not get icmp_type * by icmp_hdr(skb)->type. */ if (sk->sk_type == SOCK_RAW && !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; icmp_out_count(net, icmp_type); } ip_cork_release(cork); out: return skb; } int ip_send_skb(struct net *net, struct sk_buff *skb) { int err; err = ip_local_out(net, skb->sk, skb); if (err) { if (err > 0) err = net_xmit_errno(err); if (err) IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); } return err; } int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4) { struct sk_buff *skb; skb = ip_finish_skb(sk, fl4); if (!skb) return 0; /* Netfilter gets whole the not fragmented skb. */ return ip_send_skb(sock_net(sk), skb); } /* * Throw away all pending data on the socket. */ static void __ip_flush_pending_frames(struct sock *sk, struct sk_buff_head *queue, struct inet_cork *cork) { struct sk_buff *skb; while ((skb = __skb_dequeue_tail(queue)) != NULL) kfree_skb(skb); ip_cork_release(cork); } void ip_flush_pending_frames(struct sock *sk) { __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base); } struct sk_buff *ip_make_skb(struct sock *sk, struct flowi4 *fl4, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm_cookie *ipc, struct rtable **rtp, struct inet_cork *cork, unsigned int flags) { struct sk_buff_head queue; int err; if (flags & MSG_PROBE) return NULL; __skb_queue_head_init(&queue); cork->flags = 0; cork->addr = 0; cork->opt = NULL; err = ip_setup_cork(sk, cork, ipc, rtp); if (err) return ERR_PTR(err); err = __ip_append_data(sk, fl4, &queue, cork, &current->task_frag, getfrag, from, length, transhdrlen, flags); if (err) { __ip_flush_pending_frames(sk, &queue, cork); return ERR_PTR(err); } return __ip_make_skb(sk, fl4, &queue, cork); } /* * Fetch data from kernel space and fill in checksum if needed. */ static int ip_reply_glue_bits(void *dptr, char *to, int offset, int len, int odd, struct sk_buff *skb) { __wsum csum; csum = csum_partial_copy_nocheck(dptr+offset, to, len); skb->csum = csum_block_add(skb->csum, csum, odd); return 0; } /* * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. */ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash) { DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data, IP_OPTIONS_DATA_FIXED_SIZE); struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); struct net *net = sock_net(sk); struct sk_buff *nskb; int err; int oif; if (__ip_options_echo(net, &replyopts->opt, skb, sopt)) return; ipcm_init(&ipc); ipc.addr = daddr; ipc.sockc.transmit_time = transmit_time; if (replyopts->opt.optlen) { ipc.opt = replyopts; if (replyopts->opt.srr) daddr = replyopts->opt.faddr; } oif = arg->bound_dev_if; if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark, arg->tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, ip_reply_arg_flowi_flags(arg), daddr, saddr, tcp_hdr(skb)->source, tcp_hdr(skb)->dest, arg->uid); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) return; inet_sk(sk)->tos = arg->tos; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); ipc.sockc.mark = fl4.flowi4_mark; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); if (unlikely(err)) { ip_flush_pending_frames(sk); goto out; } nskb = skb_peek(&sk->sk_write_queue); if (nskb) { if (arg->csumoffset >= 0) *((__sum16 *)skb_transport_header(nskb) + arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); psp_reply_set_decrypted(orig_sk, nskb); } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); } out: ip_rt_put(rt); } void __init ip_init(void) { ip_rt_init(); inet_initpeers(); #if defined(CONFIG_IP_MULTICAST) igmp_mc_init(); #endif }
3 33 523 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CGROUP_INTERNAL_H #define __CGROUP_INTERNAL_H #include <linux/cgroup.h> #include <linux/kernfs.h> #include <linux/workqueue.h> #include <linux/list.h> #include <linux/refcount.h> #include <linux/fs_parser.h> #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN]; extern void __init enable_debug_cgroup(void); /* * cgroup_path() takes a spin lock. It is good practice not to take * spin locks within trace point handlers, as they are mostly hidden * from normal view. As cgroup_path() can take the kernfs_rename_lock * spin lock, it is best to not call that function from the trace event * handler. * * Note: trace_cgroup_##type##_enabled() is a static branch that will only * be set when the trace event is enabled. */ #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ unsigned long flags; \ spin_lock_irqsave(&trace_cgroup_path_lock, \ flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ spin_unlock_irqrestore(&trace_cgroup_path_lock, \ flags); \ } \ } while (0) /* * The cgroup filesystem superblock creation/mount context. */ struct cgroup_fs_context { struct kernfs_fs_context kfc; struct cgroup_root *root; struct cgroup_namespace *ns; unsigned int flags; /* CGRP_ROOT_* flags */ /* cgroup1 bits */ bool cpuset_clone_children; bool none; /* User explicitly requested empty subsystem */ bool all_ss; /* Seen 'all' option */ u32 subsys_mask; /* Selected subsystems */ char *name; /* Hierarchy name */ char *release_agent; /* Path for release notifications */ }; static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) { struct kernfs_fs_context *kfc = fc->fs_private; return container_of(kfc, struct cgroup_fs_context, kfc); } struct cgroup_pidlist; struct cgroup_file_ctx { struct cgroup_namespace *ns; struct { void *trigger; } psi; struct { bool started; struct css_task_iter iter; } procs; struct { struct cgroup_pidlist *pidlist; } procs1; struct cgroup_of_peak peak; }; /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other * direction, a css_set is naturally associated with multiple cgroups. * This M:N relationship is represented by the following link structure * which exists for each association and allows traversing the associations * from both sides. */ struct cgrp_cset_link { /* the cgroup and css_set this link associates */ struct cgroup *cgrp; struct css_set *cset; /* list of cgrp_cset_links anchored at cgrp->cset_links */ struct list_head cset_link; /* list of cgrp_cset_links anchored at css_set->cgrp_links */ struct list_head cgrp_link; }; /* used to track tasks and csets during migration */ struct cgroup_taskset { /* the src and dst cset list running through cset->mg_node */ struct list_head src_csets; struct list_head dst_csets; /* the number of tasks in the set */ int nr_tasks; /* the subsys currently being processed */ int ssid; /* * Fields for cgroup_taskset_*() iteration. * * Before migration is committed, the target migration tasks are on * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of * the csets on ->dst_csets. ->csets point to either ->src_csets * or ->dst_csets depending on whether migration is committed. * * ->cur_csets and ->cur_task point to the current task position * during iteration. */ struct list_head *csets; struct css_set *cur_cset; struct task_struct *cur_task; }; /* migration context also tracks preloading */ struct cgroup_mgctx { /* * Preloaded source and destination csets. Used to guarantee * atomic success or failure on actual migration. */ struct list_head preloaded_src_csets; struct list_head preloaded_dst_csets; /* tasks and csets to migrate */ struct cgroup_taskset tset; /* subsystems affected by migration */ u32 ss_mask; }; #define CGROUP_TASKSET_INIT(tset) \ { \ .src_csets = LIST_HEAD_INIT(tset.src_csets), \ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \ .csets = &tset.src_csets, \ } #define CGROUP_MGCTX_INIT(name) \ { \ LIST_HEAD_INIT(name.preloaded_src_csets), \ LIST_HEAD_INIT(name.preloaded_dst_csets), \ CGROUP_TASKSET_INIT(name.tset), \ } #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; extern bool cgrp_dfl_visible; /* iterate across the hierarchies */ #define for_each_root(root) \ list_for_each_entry_rcu((root), &cgroup_roots, root_list, \ lockdep_is_held(&cgroup_mutex)) /** * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end */ #define for_each_subsys(ss, ssid) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) static inline bool notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } void put_css_set_locked(struct css_set *cset); static inline void put_css_set(struct css_set *cset) { unsigned long flags; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an * rwlock */ if (refcount_dec_not_one(&cset->refcount)) return; spin_lock_irqsave(&css_set_lock, flags); put_css_set_locked(cset); spin_unlock_irqrestore(&css_set_lock, flags); } /* * refcounted get/put for css_set objects */ static inline void get_css_set(struct css_set *cset) { refcount_inc(&cset->refcount); } bool cgroup_ssid_enabled(int ssid); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroup_root *root); struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline); void cgroup_kn_unlock(struct kernfs_node *kn); int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); void cgroup_favor_dynmods(struct cgroup_root *root, bool favor); void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_fs_context *ctx); int cgroup_setup_root(struct cgroup_root *root, u32 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u32 ss_mask); int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp, struct cgroup_mgctx *mgctx); int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx); int cgroup_migrate(struct task_struct *leader, bool threadgroup, struct cgroup_mgctx *mgctx); int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, struct task_struct *tsk); void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); void cgroup_procs_write_finish(struct task_struct *task, enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode); int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ int css_rstat_init(struct cgroup_subsys_state *css); void css_rstat_exit(struct cgroup_subsys_state *css); int ss_rstat_init(struct cgroup_subsys *ss); void cgroup_base_stat_cputime_show(struct seq_file *seq); /* * namespace.c */ extern const struct proc_ns_operations cgroupns_operations; /* * cgroup-v1.c */ extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; extern const struct fs_parameter_spec cgroup1_fs_parameters[]; int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); void cgroup1_check_for_release(struct cgroup *cgrp); int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param); int cgroup1_get_tree(struct fs_context *fc); int cgroup1_reconfigure(struct fs_context *ctx); #endif /* __CGROUP_INTERNAL_H */
2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 // SPDX-License-Identifier: GPL-2.0 /* * Witness Service client for CIFS * * Copyright (c) 2020 Samuel Cabrero <scabrero@suse.de> */ #include <linux/kref.h> #include <net/genetlink.h> #include <uapi/linux/cifs/cifs_netlink.h> #include "cifs_swn.h" #include "cifsglob.h" #include "cifsproto.h" #include "fscache.h" #include "cifs_debug.h" #include "netlink.h" static DEFINE_IDR(cifs_swnreg_idr); static DEFINE_MUTEX(cifs_swnreg_idr_mutex); struct cifs_swn_reg { int id; struct kref ref_count; const char *net_name; const char *share_name; bool net_name_notify; bool share_name_notify; bool ip_notify; struct cifs_tcon *tcon; }; static int cifs_swn_auth_info_krb(struct cifs_tcon *tcon, struct sk_buff *skb) { int ret; ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_KRB_AUTH); if (ret < 0) return ret; return 0; } static int cifs_swn_auth_info_ntlm(struct cifs_tcon *tcon, struct sk_buff *skb) { int ret; if (tcon->ses->user_name != NULL) { ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_USER_NAME, tcon->ses->user_name); if (ret < 0) return ret; } if (tcon->ses->password != NULL) { ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_PASSWORD, tcon->ses->password); if (ret < 0) return ret; } if (tcon->ses->domainName != NULL) { ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_DOMAIN_NAME, tcon->ses->domainName); if (ret < 0) return ret; } return 0; } /* * Sends a register message to the userspace daemon based on the registration. * The authentication information to connect to the witness service is bundled * into the message. */ static int cifs_swn_send_register_message(struct cifs_swn_reg *swnreg) { struct sk_buff *skb; struct genlmsghdr *hdr; enum securityEnum authtype; struct sockaddr_storage *addr; int ret; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, 0, 0, &cifs_genl_family, 0, CIFS_GENL_CMD_SWN_REGISTER); if (hdr == NULL) { ret = -ENOMEM; goto nlmsg_fail; } ret = nla_put_u32(skb, CIFS_GENL_ATTR_SWN_REGISTRATION_ID, swnreg->id); if (ret < 0) goto nlmsg_fail; ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_NET_NAME, swnreg->net_name); if (ret < 0) goto nlmsg_fail; ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_SHARE_NAME, swnreg->share_name); if (ret < 0) goto nlmsg_fail; /* * If there is an address stored use it instead of the server address, because we are * in the process of reconnecting to it after a share has been moved or we have been * told to switch to it (client move message). In these cases we unregister from the * server address and register to the new address when we receive the notification. */ if (swnreg->tcon->ses->server->use_swn_dstaddr) addr = &swnreg->tcon->ses->server->swn_dstaddr; else addr = &swnreg->tcon->ses->server->dstaddr; ret = nla_put(skb, CIFS_GENL_ATTR_SWN_IP, sizeof(struct sockaddr_storage), addr); if (ret < 0) goto nlmsg_fail; if (swnreg->net_name_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_NET_NAME_NOTIFY); if (ret < 0) goto nlmsg_fail; } if (swnreg->share_name_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_SHARE_NAME_NOTIFY); if (ret < 0) goto nlmsg_fail; } if (swnreg->ip_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_IP_NOTIFY); if (ret < 0) goto nlmsg_fail; } authtype = cifs_select_sectype(swnreg->tcon->ses->server, swnreg->tcon->ses->sectype); switch (authtype) { case Kerberos: ret = cifs_swn_auth_info_krb(swnreg->tcon, skb); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to get kerberos auth info: %d\n", __func__, ret); goto nlmsg_fail; } break; case NTLMv2: case RawNTLMSSP: ret = cifs_swn_auth_info_ntlm(swnreg->tcon, skb); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to get NTLM auth info: %d\n", __func__, ret); goto nlmsg_fail; } break; default: cifs_dbg(VFS, "%s: secType %d not supported!\n", __func__, authtype); ret = -EINVAL; goto nlmsg_fail; } genlmsg_end(skb, hdr); genlmsg_multicast(&cifs_genl_family, skb, 0, CIFS_GENL_MCGRP_SWN, GFP_ATOMIC); cifs_dbg(FYI, "%s: Message to register for network name %s with id %d sent\n", __func__, swnreg->net_name, swnreg->id); return 0; nlmsg_fail: genlmsg_cancel(skb, hdr); nlmsg_free(skb); return ret; } /* * Sends an uregister message to the userspace daemon based on the registration */ static int cifs_swn_send_unregister_message(struct cifs_swn_reg *swnreg) { struct sk_buff *skb; struct genlmsghdr *hdr; int ret; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (skb == NULL) return -ENOMEM; hdr = genlmsg_put(skb, 0, 0, &cifs_genl_family, 0, CIFS_GENL_CMD_SWN_UNREGISTER); if (hdr == NULL) { ret = -ENOMEM; goto nlmsg_fail; } ret = nla_put_u32(skb, CIFS_GENL_ATTR_SWN_REGISTRATION_ID, swnreg->id); if (ret < 0) goto nlmsg_fail; ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_NET_NAME, swnreg->net_name); if (ret < 0) goto nlmsg_fail; ret = nla_put_string(skb, CIFS_GENL_ATTR_SWN_SHARE_NAME, swnreg->share_name); if (ret < 0) goto nlmsg_fail; ret = nla_put(skb, CIFS_GENL_ATTR_SWN_IP, sizeof(struct sockaddr_storage), &swnreg->tcon->ses->server->dstaddr); if (ret < 0) goto nlmsg_fail; if (swnreg->net_name_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_NET_NAME_NOTIFY); if (ret < 0) goto nlmsg_fail; } if (swnreg->share_name_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_SHARE_NAME_NOTIFY); if (ret < 0) goto nlmsg_fail; } if (swnreg->ip_notify) { ret = nla_put_flag(skb, CIFS_GENL_ATTR_SWN_IP_NOTIFY); if (ret < 0) goto nlmsg_fail; } genlmsg_end(skb, hdr); genlmsg_multicast(&cifs_genl_family, skb, 0, CIFS_GENL_MCGRP_SWN, GFP_ATOMIC); cifs_dbg(FYI, "%s: Message to unregister for network name %s with id %d sent\n", __func__, swnreg->net_name, swnreg->id); return 0; nlmsg_fail: genlmsg_cancel(skb, hdr); nlmsg_free(skb); return ret; } /* * Try to find a matching registration for the tcon's server name and share name. * Calls to this function must be protected by cifs_swnreg_idr_mutex. * TODO Try to avoid memory allocations */ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) { struct cifs_swn_reg *swnreg; int id; const char *share_name; const char *net_name; net_name = extract_hostname(tcon->tree_name); if (IS_ERR(net_name)) { int ret; ret = PTR_ERR(net_name); cifs_dbg(VFS, "%s: failed to extract host name from target '%s': %d\n", __func__, tcon->tree_name, ret); return ERR_PTR(-EINVAL); } share_name = extract_sharename(tcon->tree_name); if (IS_ERR(share_name)) { int ret; ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", __func__, tcon->tree_name, ret); kfree(net_name); return ERR_PTR(-EINVAL); } idr_for_each_entry(&cifs_swnreg_idr, swnreg, id) { if (strcasecmp(swnreg->net_name, net_name) != 0 || strcasecmp(swnreg->share_name, share_name) != 0) { continue; } cifs_dbg(FYI, "Existing swn registration for %s:%s found\n", swnreg->net_name, swnreg->share_name); kfree(net_name); kfree(share_name); return swnreg; } kfree(net_name); kfree(share_name); return ERR_PTR(-EEXIST); } /* * Get a registration for the tcon's server and share name, allocating a new one if it does not * exists */ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon) { struct cifs_swn_reg *reg = NULL; int ret; mutex_lock(&cifs_swnreg_idr_mutex); /* Check if we are already registered for this network and share names */ reg = cifs_find_swn_reg(tcon); if (!IS_ERR(reg)) { kref_get(&reg->ref_count); goto unlock; } else if (PTR_ERR(reg) != -EEXIST) { goto unlock; } reg = kmalloc_obj(struct cifs_swn_reg, GFP_ATOMIC); if (reg == NULL) { ret = -ENOMEM; goto fail_unlock; } kref_init(&reg->ref_count); reg->id = idr_alloc(&cifs_swnreg_idr, reg, 1, 0, GFP_ATOMIC); if (reg->id < 0) { cifs_dbg(FYI, "%s: failed to allocate registration id\n", __func__); ret = reg->id; goto fail; } reg->net_name = extract_hostname(tcon->tree_name); if (IS_ERR(reg->net_name)) { ret = PTR_ERR(reg->net_name); cifs_dbg(VFS, "%s: failed to extract host name from target: %d\n", __func__, ret); goto fail_idr; } reg->share_name = extract_sharename(tcon->tree_name); if (IS_ERR(reg->share_name)) { ret = PTR_ERR(reg->share_name); cifs_dbg(VFS, "%s: failed to extract share name from target: %d\n", __func__, ret); goto fail_net_name; } reg->net_name_notify = true; reg->share_name_notify = true; reg->ip_notify = (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT); reg->tcon = tcon; unlock: mutex_unlock(&cifs_swnreg_idr_mutex); return reg; fail_net_name: kfree(reg->net_name); fail_idr: idr_remove(&cifs_swnreg_idr, reg->id); fail: kfree(reg); fail_unlock: mutex_unlock(&cifs_swnreg_idr_mutex); return ERR_PTR(ret); } static void cifs_swn_reg_release(struct kref *ref) { struct cifs_swn_reg *swnreg = container_of(ref, struct cifs_swn_reg, ref_count); int ret; ret = cifs_swn_send_unregister_message(swnreg); if (ret < 0) cifs_dbg(VFS, "%s: Failed to send unregister message: %d\n", __func__, ret); idr_remove(&cifs_swnreg_idr, swnreg->id); kfree(swnreg->net_name); kfree(swnreg->share_name); kfree(swnreg); } static void cifs_put_swn_reg(struct cifs_swn_reg *swnreg) { mutex_lock(&cifs_swnreg_idr_mutex); kref_put(&swnreg->ref_count, cifs_swn_reg_release); mutex_unlock(&cifs_swnreg_idr_mutex); } static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const char *name, int state) { switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); break; } return 0; } static bool cifs_sockaddr_equal(struct sockaddr_storage *addr1, struct sockaddr_storage *addr2) { if (addr1->ss_family != addr2->ss_family) return false; if (addr1->ss_family == AF_INET) { return (memcmp(&((const struct sockaddr_in *)addr1)->sin_addr, &((const struct sockaddr_in *)addr2)->sin_addr, sizeof(struct in_addr)) == 0); } if (addr1->ss_family == AF_INET6) { return (memcmp(&((const struct sockaddr_in6 *)addr1)->sin6_addr, &((const struct sockaddr_in6 *)addr2)->sin6_addr, sizeof(struct in6_addr)) == 0); } return false; } static int cifs_swn_store_swn_addr(const struct sockaddr_storage *new, const struct sockaddr_storage *old, struct sockaddr_storage *dst) { __be16 port = cpu_to_be16(CIFS_PORT); if (old->ss_family == AF_INET) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)old; port = ipv4->sin_port; } else if (old->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)old; port = ipv6->sin6_port; } if (new->ss_family == AF_INET) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)new; ipv4->sin_port = port; } else if (new->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)new; ipv6->sin6_port = port; } *dst = *new; return 0; } static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *addr) { int ret = 0; /* Store the reconnect address */ cifs_server_lock(tcon->ses->server); if (cifs_sockaddr_equal(&tcon->ses->server->dstaddr, addr)) goto unlock; ret = cifs_swn_store_swn_addr(addr, &tcon->ses->server->dstaddr, &tcon->ses->server->swn_dstaddr); if (ret < 0) { cifs_dbg(VFS, "%s: failed to store address: %d\n", __func__, ret); goto unlock; } tcon->ses->server->use_swn_dstaddr = true; /* * Unregister to stop receiving notifications for the old IP address. */ ret = cifs_swn_unregister(tcon); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to unregister for witness notifications: %d\n", __func__, ret); goto unlock; } /* * And register to receive notifications for the new IP address now that we have * stored the new address. */ ret = cifs_swn_register(tcon); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to register for witness notifications: %d\n", __func__, ret); goto unlock; } cifs_signal_cifsd_for_reconnect(tcon->ses->server, false); unlock: cifs_server_unlock(tcon->ses->server); return ret; } static int cifs_swn_client_move(struct cifs_swn_reg *swnreg, struct sockaddr_storage *addr) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)addr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)addr; if (addr->ss_family == AF_INET) cifs_dbg(FYI, "%s: move to %pI4\n", __func__, &ipv4->sin_addr); else if (addr->ss_family == AF_INET6) cifs_dbg(FYI, "%s: move to %pI6\n", __func__, &ipv6->sin6_addr); return cifs_swn_reconnect(swnreg->tcon, addr); } int cifs_swn_notify(struct sk_buff *skb, struct genl_info *info) { struct cifs_swn_reg *swnreg; char name[256]; int type; if (info->attrs[CIFS_GENL_ATTR_SWN_REGISTRATION_ID]) { int swnreg_id; swnreg_id = nla_get_u32(info->attrs[CIFS_GENL_ATTR_SWN_REGISTRATION_ID]); mutex_lock(&cifs_swnreg_idr_mutex); swnreg = idr_find(&cifs_swnreg_idr, swnreg_id); mutex_unlock(&cifs_swnreg_idr_mutex); if (swnreg == NULL) { cifs_dbg(FYI, "%s: registration id %d not found\n", __func__, swnreg_id); return -EINVAL; } } else { cifs_dbg(FYI, "%s: missing registration id attribute\n", __func__); return -EINVAL; } if (info->attrs[CIFS_GENL_ATTR_SWN_NOTIFICATION_TYPE]) { type = nla_get_u32(info->attrs[CIFS_GENL_ATTR_SWN_NOTIFICATION_TYPE]); } else { cifs_dbg(FYI, "%s: missing notification type attribute\n", __func__); return -EINVAL; } switch (type) { case CIFS_SWN_NOTIFICATION_RESOURCE_CHANGE: { int state; if (info->attrs[CIFS_GENL_ATTR_SWN_RESOURCE_NAME]) { nla_strscpy(name, info->attrs[CIFS_GENL_ATTR_SWN_RESOURCE_NAME], sizeof(name)); } else { cifs_dbg(FYI, "%s: missing resource name attribute\n", __func__); return -EINVAL; } if (info->attrs[CIFS_GENL_ATTR_SWN_RESOURCE_STATE]) { state = nla_get_u32(info->attrs[CIFS_GENL_ATTR_SWN_RESOURCE_STATE]); } else { cifs_dbg(FYI, "%s: missing resource state attribute\n", __func__); return -EINVAL; } return cifs_swn_resource_state_changed(swnreg, name, state); } case CIFS_SWN_NOTIFICATION_CLIENT_MOVE: { struct sockaddr_storage addr; if (info->attrs[CIFS_GENL_ATTR_SWN_IP]) { nla_memcpy(&addr, info->attrs[CIFS_GENL_ATTR_SWN_IP], sizeof(addr)); } else { cifs_dbg(FYI, "%s: missing IP address attribute\n", __func__); return -EINVAL; } return cifs_swn_client_move(swnreg, &addr); } default: cifs_dbg(FYI, "%s: unknown notification type %d\n", __func__, type); break; } return 0; } int cifs_swn_register(struct cifs_tcon *tcon) { struct cifs_swn_reg *swnreg; int ret; swnreg = cifs_get_swn_reg(tcon); if (IS_ERR(swnreg)) return PTR_ERR(swnreg); ret = cifs_swn_send_register_message(swnreg); if (ret < 0) { cifs_dbg(VFS, "%s: Failed to send swn register message: %d\n", __func__, ret); /* Do not put the swnreg or return error, the echo task will retry */ } return 0; } int cifs_swn_unregister(struct cifs_tcon *tcon) { struct cifs_swn_reg *swnreg; mutex_lock(&cifs_swnreg_idr_mutex); swnreg = cifs_find_swn_reg(tcon); if (IS_ERR(swnreg)) { mutex_unlock(&cifs_swnreg_idr_mutex); return PTR_ERR(swnreg); } mutex_unlock(&cifs_swnreg_idr_mutex); cifs_put_swn_reg(swnreg); return 0; } void cifs_swn_dump(struct seq_file *m) { struct cifs_swn_reg *swnreg; struct sockaddr_in *sa; struct sockaddr_in6 *sa6; int id; seq_puts(m, "Witness registrations:"); mutex_lock(&cifs_swnreg_idr_mutex); idr_for_each_entry(&cifs_swnreg_idr, swnreg, id) { seq_printf(m, "\nId: %u Refs: %u Network name: '%s'%s Share name: '%s'%s Ip address: ", id, kref_read(&swnreg->ref_count), swnreg->net_name, swnreg->net_name_notify ? "(y)" : "(n)", swnreg->share_name, swnreg->share_name_notify ? "(y)" : "(n)"); switch (swnreg->tcon->ses->server->dstaddr.ss_family) { case AF_INET: sa = (struct sockaddr_in *) &swnreg->tcon->ses->server->dstaddr; seq_printf(m, "%pI4", &sa->sin_addr.s_addr); break; case AF_INET6: sa6 = (struct sockaddr_in6 *) &swnreg->tcon->ses->server->dstaddr; seq_printf(m, "%pI6", &sa6->sin6_addr.s6_addr); if (sa6->sin6_scope_id) seq_printf(m, "%%%u", sa6->sin6_scope_id); break; default: seq_puts(m, "(unknown)"); } seq_printf(m, "%s", swnreg->ip_notify ? "(y)" : "(n)"); } mutex_unlock(&cifs_swnreg_idr_mutex); seq_puts(m, "\n"); } void cifs_swn_check(void) { struct cifs_swn_reg *swnreg; int id; int ret; mutex_lock(&cifs_swnreg_idr_mutex); idr_for_each_entry(&cifs_swnreg_idr, swnreg, id) { ret = cifs_swn_send_register_message(swnreg); if (ret < 0) cifs_dbg(FYI, "%s: Failed to send register message: %d\n", __func__, ret); } mutex_unlock(&cifs_swnreg_idr_mutex); }
90 10 13690 13765 2083 2 12 1052 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 /* SPDX-License-Identifier: GPL-2.0-only */ /* * kref.h - library routines for handling generic reference counted objects * * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2004 IBM Corp. * * based on kobject.h which was: * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org> * Copyright (C) 2002-2003 Open Source Development Labs */ #ifndef _KREF_H_ #define _KREF_H_ #include <linux/spinlock.h> #include <linux/refcount.h> struct kref { refcount_t refcount; }; #define KREF_INIT(n) { .refcount = REFCOUNT_INIT(n), } /** * kref_init - initialize object. * @kref: object in question. */ static inline void kref_init(struct kref *kref) { refcount_set(&kref->refcount, 1); } static inline unsigned int kref_read(const struct kref *kref) { return refcount_read(&kref->refcount); } /** * kref_get - increment refcount for object. * @kref: object. */ static inline void kref_get(struct kref *kref) { refcount_inc(&kref->refcount); } /** * kref_put - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * * Decrement the refcount, and if 0, call @release. The caller may not * pass NULL or kfree() as the release function. * * Return: 1 if this call removed the object, otherwise return 0. Beware, * if this function returns 0, another caller may have removed the object * by the time this function returns. The return value is only certain * if you want to see if the object is definitely released. */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { if (refcount_dec_and_test(&kref->refcount)) { release(kref); return 1; } return 0; } /** * kref_put_mutex - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * @mutex: Mutex which protects the release function. * * This variant of kref_lock() calls the @release function with the @mutex * held. The @release function will release the mutex. */ static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *mutex) __cond_acquires(true, mutex) { if (refcount_dec_and_mutex_lock(&kref->refcount, mutex)) { release(kref); return 1; } return 0; } /** * kref_put_lock - Decrement refcount for object * @kref: Object * @release: Pointer to the function that will clean up the object when the * last reference to the object is released. * @lock: Spinlock which protects the release function. * * This variant of kref_lock() calls the @release function with the @lock * held. The @release function will release the lock. */ static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) __cond_acquires(true, lock) { if (refcount_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; } return 0; } /** * kref_get_unless_zero - Increment refcount for object unless it is zero. * @kref: object. * * This function is intended to simplify locking around refcounting for * objects that can be looked up from a lookup structure, and which are * removed from that lookup structure in the object destructor. * Operations on such objects require at least a read lock around * lookup + kref_get, and a write lock around kref_put + remove from lookup * structure. Furthermore, RCU implementations become extremely tricky. * With a lookup followed by a kref_get_unless_zero *with return value check* * locking in the kref_put path can be deferred to the actual removal from * the lookup structure and RCU lookups become trivial. * * Return: non-zero if the increment succeeded. Otherwise return 0. */ static inline int __must_check kref_get_unless_zero(struct kref *kref) { return refcount_inc_not_zero(&kref->refcount); } #endif /* _KREF_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_MDS_CLIENT_H #define _FS_CEPH_MDS_CLIENT_H #include <linux/completion.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/refcount.h> #include <linux/utsname.h> #include <linux/ktime.h> #include <linux/ceph/types.h> #include <linux/ceph/messenger.h> #include <linux/ceph/auth.h> #include "mdsmap.h" #include "metric.h" #include "super.h" /* The first 8 bits are reserved for old ceph releases */ enum ceph_feature_type { CEPHFS_FEATURE_MIMIC = 8, CEPHFS_FEATURE_REPLY_ENCODING, CEPHFS_FEATURE_RECLAIM_CLIENT, CEPHFS_FEATURE_LAZY_CAP_WANTED, CEPHFS_FEATURE_MULTI_RECONNECT, CEPHFS_FEATURE_DELEG_INO, CEPHFS_FEATURE_METRIC_COLLECT, CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, CEPHFS_FEATURE_OP_GETVXATTR, CEPHFS_FEATURE_32BITS_RETRY_FWD, CEPHFS_FEATURE_NEW_SNAPREALM_INFO, CEPHFS_FEATURE_HAS_OWNER_UIDGID, CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ CEPHFS_FEATURE_REPLY_ENCODING, \ CEPHFS_FEATURE_LAZY_CAP_WANTED, \ CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ CEPHFS_FEATURE_ALTERNATE_NAME, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ CEPHFS_FEATURE_OP_GETVXATTR, \ CEPHFS_FEATURE_32BITS_RETRY_FWD, \ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ CEPHFS_FEATURE_MDS_AUTH_CAPS_CHECK, \ } /* * Some lock dependencies: * * session->s_mutex * mdsc->mutex * * mdsc->snap_rwsem * * ci->i_ceph_lock * mdsc->snap_flush_lock * mdsc->cap_delay_lock * */ struct ceph_fs_client; struct ceph_cap; #define MDS_AUTH_UID_ANY -1 struct ceph_mds_cap_match { s64 uid; /* default to MDS_AUTH_UID_ANY */ u32 num_gids; u32 *gids; /* use these GIDs */ char *path; /* require path to be child of this (may be "" or "/" for any) */ char *fs_name; bool root_squash; /* default to false */ }; struct ceph_mds_cap_auth { struct ceph_mds_cap_match match; bool readable; bool writeable; }; /* * parsed info about a single inode. pointers are into the encoded * on-wire structures within the mds reply message payload. */ struct ceph_mds_reply_info_in { struct ceph_mds_reply_inode *in; struct ceph_dir_layout dir_layout; u32 symlink_len; char *symlink; u32 xattr_len; char *xattr_data; u64 inline_version; u32 inline_len; char *inline_data; u32 pool_ns_len; char *pool_ns_data; u64 max_bytes; u64 max_files; s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; u8 *fscrypt_auth; u8 *fscrypt_file; u32 fscrypt_auth_len; u32 fscrypt_file_len; u64 rsnaps; u64 change_attr; }; struct ceph_mds_reply_dir_entry { bool is_nokey; char *name; u32 name_len; u32 raw_hash; struct ceph_mds_reply_lease *lease; struct ceph_mds_reply_info_in inode; loff_t offset; }; struct ceph_mds_reply_xattr { char *xattr_value; size_t xattr_value_len; }; /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, * and directory contents (for readdir results), or * 2) the file range lock info (for fcntl F_GETLK results). */ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_head *head; /* trace */ struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; u8 *altname; u32 dname_len; u32 altname_len; struct ceph_mds_reply_lease *dlease; struct ceph_mds_reply_xattr xattr_info; /* extra */ union { /* for fcntl F_GETLK results */ struct ceph_filelock *filelock_reply; /* for readdir results */ struct { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; bool dir_end; bool dir_complete; bool hash_order; bool offset_hash; struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ struct { bool has_create_ino; u64 ino; }; }; /* encoded blob describing snapshot contexts for certain operations (e.g., open) */ void *snapblob; int snapblob_len; }; /* * cap releases are batched and sent to the MDS en masse. * * Account for per-message overhead of mds_cap_release header * and __le32 for osd epoch barrier trailing field. */ #define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \ sizeof(struct ceph_mds_cap_release)) / \ sizeof(struct ceph_mds_cap_item)) /* * state associated with each MDS<->client session */ enum { CEPH_MDS_SESSION_NEW = 1, CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_HUNG = 4, CEPH_MDS_SESSION_RESTARTING = 5, CEPH_MDS_SESSION_RECONNECTING = 6, CEPH_MDS_SESSION_CLOSING = 7, CEPH_MDS_SESSION_CLOSED = 8, CEPH_MDS_SESSION_REJECTED = 9, }; struct ceph_mds_session { struct ceph_mds_client *s_mdsc; int s_mds; int s_state; unsigned long s_ttl; /* time until mds kills us */ unsigned long s_features; u64 s_seq; /* incoming msg seq # */ struct mutex s_mutex; /* serialize session messages */ struct ceph_connection s_con; struct ceph_auth_handshake s_auth; atomic_t s_cap_gen; /* inc each time we get mds stale msg */ unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */ /* protected by s_cap_lock */ spinlock_t s_cap_lock; refcount_t s_ref; struct list_head s_caps; /* all caps issued by this session */ struct ceph_cap *s_cap_iterator; int s_nr_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ struct work_struct s_cap_release_work; /* See ceph_inode_info->i_dirty_item. */ struct list_head s_cap_dirty; /* inodes w/ dirty caps */ /* See ceph_inode_info->i_flushing_item. */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ struct xarray s_delegated_inos; }; /* * modes of choosing which MDS to send a request to */ enum { USE_ANY_MDS, USE_RANDOM_MDS, USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ }; struct ceph_mds_request; struct ceph_mds_client; /* * request completion callback */ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request *req); /* * wait for request completion callback */ typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request *req); /* * an in-flight mds request */ struct ceph_mds_request { u64 r_tid; /* transaction id */ struct rb_node r_node; struct ceph_mds_client *r_mdsc; struct kref r_kref; int r_op; /* mds op code */ /* operation on what? */ struct inode *r_inode; /* arg1 */ struct dentry *r_dentry; /* arg1 */ struct dentry *r_old_dentry; /* arg2: rename from or link from */ struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ struct inode *r_new_inode; /* new inode (for creates) */ const struct qstr *r_dname; /* stable name (for ->d_revalidate) */ #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ #define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ #define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ #define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ #define CEPH_MDS_R_ASYNC (8) /* async request */ #define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */ unsigned long r_req_flags; struct mutex r_fill_mutex; union ceph_mds_request_args r_args; struct ceph_fscrypt_auth *r_fscrypt_auth; u64 r_fscrypt_file; u8 *r_altname; /* fscrypt binary crypttext for long filenames */ u32 r_altname_len; /* length of r_altname */ int r_fmode; /* file mode, if expecting cap */ int r_request_release_offset; const struct cred *r_cred; struct mnt_idmap *r_mnt_idmap; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ int r_direct_mode; u32 r_direct_hash; /* choose dir frag based on this dentry hash */ /* data payload is used for xattr ops */ struct ceph_pagelist *r_pagelist; /* what caps shall we drop? */ int r_inode_drop, r_inode_unless; int r_dentry_drop, r_dentry_unless; int r_old_dentry_drop, r_old_dentry_unless; struct inode *r_old_inode; int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; u32 r_readdir_offset; struct page *r_locked_page; int r_dir_caps; int r_num_caps; unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_start_latency; /* start time to measure latency */ unsigned long r_end_latency; /* finish time to measure latency */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ /* link unsafe requests to parent directory, for fsync */ struct inode *r_unsafe_dir; struct list_head r_unsafe_dir_item; /* unsafe requests that modify the target inode */ struct list_head r_unsafe_target_item; struct ceph_mds_session *r_session; int r_attempts; /* resend attempts */ int r_num_fwd; /* number of forward attempts */ int r_resend_mds; /* mds to resend to next, if any*/ u32 r_sent_on_mseq; /* cap mseq request was sent at*/ u64 r_deleg_ino; struct list_head r_wait; struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; struct list_head r_unsafe_item; /* per-session unsafe list item */ long long r_dir_release_cnt; long long r_dir_ordered_cnt; int r_readdir_cache_idx; int r_feature_needed; struct ceph_cap_reservation r_caps_reservation; }; struct ceph_pool_perm { struct rb_node node; int perm; s64 pool; size_t pool_ns_len; char pool_ns[]; }; struct ceph_snapid_map { struct rb_node node; struct list_head lru; atomic_t ref; dev_t dev; u64 snap; unsigned long last_used; }; /* * node for list of quotarealm inodes that are not visible from the filesystem * mountpoint, but required to handle, e.g. quotas. */ struct ceph_quotarealm_inode { struct rb_node node; u64 ino; unsigned long timeout; /* last time a lookup failed for this inode */ struct mutex mutex; struct inode *inode; }; #ifdef CONFIG_DEBUG_FS struct cap_wait { struct list_head list; u64 ino; pid_t tgid; int need; int want; }; #endif enum { CEPH_MDSC_STOPPING_BEGIN = 1, CEPH_MDSC_STOPPING_FLUSHING = 2, CEPH_MDSC_STOPPING_FLUSHED = 3, }; /* * mds client state */ struct ceph_mds_client { struct ceph_fs_client *fsc; struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; struct completion safe_umount_waiters; wait_queue_head_t session_close_wq; struct list_head waiting_for_map; int mdsmap_err; struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; int max_sessions; /* len of sessions array */ spinlock_t stopping_lock; /* protect snap_empty */ int stopping; /* the stage of shutting down */ atomic_t stopping_blockers; struct completion stopping_waiter; atomic64_t dirty_folios; wait_queue_head_t flush_end_wq; atomic64_t quotarealms_count; /* # realms with quota */ /* * We keep a list of inodes we don't see in the mountpoint but that we * need to track quota realms. */ struct rb_root quotarealms_inodes; struct mutex quotarealms_inodes_mutex; /* * snap_rwsem will cover cap linkage into snaprealms, and * realm snap contexts. (later, we can do per-realm snap * contexts locks..) the empty list contains realms with no * references (implying they contain no inodes with caps) that * should be destroyed. */ u64 last_snap_seq; struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; int num_snap_realms; spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ u64 oldest_tid; /* oldest incomplete mds request, excluding setfilelock requests */ struct rb_root request_tree; /* pending mds requests */ struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */ spinlock_t cap_delay_lock; /* protects cap_delay_list and cap_unlink_delay_list */ struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; u64 last_cap_flush_tid; struct list_head cap_flush_list; struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; struct work_struct cap_reclaim_work; atomic_t cap_reclaim_pending; struct work_struct cap_unlink_work; /* * Cap reservations * * Maintain a global pool of preallocated struct ceph_caps, referenced * by struct ceph_caps_reservations. This ensures that we preallocate * memory needed to successfully process an MDS response. (If an MDS * sends us cap information and we fail to process it, we will have * problems due to the client and MDS being out of sync.) * * Reservations are 'owned' by a ceph_cap_reservation context. */ spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ #ifdef CONFIG_DEBUG_FS struct list_head cap_wait_list; #endif int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ int caps_reserve_count; /* unused, reserved */ int caps_avail_count; /* unused, unreserved */ int caps_min_count; /* keep at least this many (unreserved) */ spinlock_t dentry_list_lock; struct list_head dentry_leases; /* fifo list */ struct list_head dentry_dir_leases; /* lru list */ struct ceph_client_metric metric; spinlock_t snapid_map_lock; struct rb_root snapid_map_tree; struct list_head snapid_map_lru; struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; u32 s_cap_auths_num; struct ceph_mds_cap_auth *s_cap_auths; char nodename[__NEW_UTS_LEN + 1]; }; extern const char *ceph_mds_op_name(int op); extern bool check_session_state(struct ceph_mds_session *s); void inc_session_sequence(struct ceph_mds_session *s); extern struct ceph_mds_session * __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); extern const char *ceph_session_state_name(int s); extern struct ceph_mds_session * ceph_get_mds_session(struct ceph_mds_session *s); extern void ceph_put_mds_session(struct ceph_mds_session *s); extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc); extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, ceph_mds_request_wait_callback_t wait_func); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); } extern void ceph_mdsc_release_request(struct kref *kref); static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) { kref_put(&req->r_kref, ceph_mdsc_release_request); } extern void send_flush_mdlog(struct ceph_mds_session *s); extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, void (*cb)(struct ceph_mds_session *), bool check_state); extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); extern void __ceph_queue_cap_release(struct ceph_mds_session *session, struct ceph_cap *cap); extern void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg); extern int ceph_mds_check_access(struct ceph_mds_client *mdsc, char *tpath, int mask); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); /* * Structure to group path-related output parameters for build_*_path functions */ struct ceph_path_info { const char *path; int pathlen; struct ceph_vino vino; bool freepath; }; static inline void ceph_mdsc_free_path_info(const struct ceph_path_info *path_info) { if (path_info && path_info->freepath && !IS_ERR_OR_NULL(path_info->path)) __putname((char *)path_info->path - (PATH_MAX - 1 - path_info->pathlen)); } extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, struct ceph_path_info *path_info, int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, struct dentry *dentry, char action, u32 seq); extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg); extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg); extern struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); extern int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int max_caps); static inline int ceph_wait_on_async_create(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, TASK_KILLABLE); } extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); extern bool enable_unsafe_idmap; #endif
2367 1 32 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 #define DEVCG_ACC_WRITE 4 #define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) #define DEVCG_DEV_BLOCK 1 #define DEVCG_DEV_CHAR 2 #define DEVCG_DEV_ALL 4 /* this represents all devices */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access); static inline int devcgroup_inode_permission(struct inode *inode, int mask) { short type, access = 0; if (likely(!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))) return 0; if (!inode->i_rdev) return 0; if (S_ISBLK(inode->i_mode)) type = DEVCG_DEV_BLOCK; else /* S_ISCHR by the test above */ type = DEVCG_DEV_CHAR; if (mask & MAY_WRITE) access |= DEVCG_ACC_WRITE; if (mask & MAY_READ) access |= DEVCG_ACC_READ; return devcgroup_check_permission(type, imajor(inode), iminor(inode), access); } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { short type; if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; if (S_ISCHR(mode) && dev == WHITEOUT_DEV) return 0; if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else type = DEVCG_DEV_CHAR; return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), DEVCG_ACC_MKNOD); } #else static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { return 0; } static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { return 0; } #endif
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/if_team.h - Network team device driver header * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #ifndef _LINUX_IF_TEAM_H_ #define _LINUX_IF_TEAM_H_ #include <linux/netpoll.h> #include <net/sch_generic.h> #include <linux/types.h> #include <uapi/linux/if_team.h> struct team_pcpu_stats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t rx_multicast; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; u32 rx_nohandler; }; struct team; struct team_port { struct net_device *dev; struct hlist_node hlist; /* node in enabled ports hash list */ struct list_head list; /* node in ordinary list */ struct team *team; int index; /* index of enabled port. If disabled, it's set to -1 */ bool linkup; /* either state.linkup or user.linkup */ struct { bool linkup; u32 speed; u8 duplex; } state; /* Values set by userspace */ struct { bool linkup; bool linkup_enabled; } user; /* Custom gennetlink interface related flags */ bool changed; bool removed; /* * A place for storing original values of the device before it * become a port. */ struct { unsigned char dev_addr[MAX_ADDR_LEN]; unsigned int mtu; } orig; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif s32 priority; /* lower number ~ higher priority */ u16 queue_id; struct list_head qom_list; /* node in queue override mapping list */ struct rcu_head rcu; long mode_priv[]; }; static inline struct team_port *team_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; } static inline bool team_port_txable(struct team_port *port) { return port->linkup && team_port_enabled(port); } static inline bool team_port_dev_txable(const struct net_device *port_dev) { struct team_port *port; bool txable; rcu_read_lock(); port = team_port_get_rcu(port_dev); txable = port ? team_port_txable(port) : false; rcu_read_unlock(); return txable; } #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { } #endif struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); rx_handler_result_t (*receive)(struct team *team, struct team_port *port, struct sk_buff *skb); bool (*transmit)(struct team *team, struct sk_buff *skb); int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); extern void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port); enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, TEAM_OPTION_TYPE_BINARY, TEAM_OPTION_TYPE_BOOL, TEAM_OPTION_TYPE_S32, }; struct team_option_inst_info { u32 array_index; struct team_port *port; /* != NULL if per-port */ }; struct team_gsetter_ctx { union { u32 u32_val; const char *str_val; struct { const void *ptr; u32 len; } bin_val; bool bool_val; s32 s32_val; } data; struct team_option_inst_info *info; }; struct team_option { struct list_head list; const char *name; bool per_port; unsigned int array_size; /* != 0 means the option is array */ enum team_option_type type; void (*init)(struct team *team, struct team_option_inst_info *info); void (*getter)(struct team *team, struct team_gsetter_ctx *ctx); int (*setter)(struct team *team, struct team_gsetter_ctx *ctx); }; extern void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info); extern void team_options_change_check(struct team *team); struct team_mode { const char *kind; struct module *owner; size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 #define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS) #define TEAM_MODE_PRIV_LONGS 4 #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; /* * List of enabled ports and their count */ int en_port_count; struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ struct list_head option_list; struct list_head option_inst_list; /* list of option instances */ const struct team_mode *mode; struct team_mode_ops ops; bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } notify_peers; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; if (unlikely(netpoll_tx_running(team->dev))) { team_netpoll_send_skb(port, skb); return 0; } return dev_queue_xmit(skb); } static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { return &team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; } static inline struct team_port *team_get_port_by_index(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; return num % en_port_count; } static inline struct team_port *team_get_port_by_index_rcu(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline struct team_port * team_get_first_port_txable_rcu(struct team *team, struct team_port *port) { struct team_port *cur; if (likely(team_port_txable(port))) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; if (team_port_txable(cur)) return cur; } return NULL; } extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); extern void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count); extern int team_mode_register(const struct team_mode *mode); extern void team_mode_unregister(const struct team_mode *mode); #define TEAM_DEFAULT_NUM_TX_QUEUES 16 #define TEAM_DEFAULT_NUM_RX_QUEUES 16 #define MODULE_ALIAS_TEAM_MODE(kind) MODULE_ALIAS("team-mode-" kind) #endif /* _LINUX_IF_TEAM_H_ */
21 22 24 47 11 10 1 149 29 20 264 266 1 73 72 1 193 193 2 47 47 36 11 47 907 2 6 2 6 108 4 197 167 23 189 67 67 1 67 26 3 11 47 330 6 6 4 4 13 10 11 11 11 11 2 179 3 2 2 49 27 390 1 2 255 1 34 2 2 2 6 273 5 271 271 269 270 267 13 11 267 267 2 14 13 14 14 3 3 3 3 3 3 2 3 2 3 3 9 5 3 2 1 1 1 82 80 1 9 9 82 81 1 83 30 30 3 29 30 74 10 9 1 1 1 10 80 79 2 1 3 7 5 78 5 5 1 4 4 4 3 5 82 5 165 165 1 165 1 164 1 164 2 166 159 7 113 2 112 2 92 1 1 24 11 7 24 3 22 5 23 4 111 47 111 93 12 72 58 24 92 11 91 55 60 59 58 1 1 88 4 90 91 14 12 7 6 7 1 7 7 7 14 14 14 9 1 14 14 14 13 6 11 198 1 1 11 295 293 292 113 177 176 3 153 28 3 5 1 6 6 1 6 259 260 254 8 3 6 6 251 247 254 1 254 2 2 253 3 11 11 1 11 11 11 3 8 3 11 11 11 11 11 10 1 1 1 1 1 480 483 1 483 373 1 11 121 2 107 31 116 5 3 6 247 2 4 5 249 1 1 251 5 3 4 180 73 171 174 174 173 1 2 1 1 29 2 1 1 5 1 2 2 1 1 293 291 5 294 118 182 89 88 89 80 1 7 7 7 1 1 1 13 16 6 259 7 1 3 1 2 26 26 183 12 1 8 10 21 87 20 262 88 162 12 253 2 1 255 59 164 71 17 259 262 256 2 10 1 9 9 9 1 8 2 2 11 6 6 4 76 73 1 2 74 74 73 36 54 30 67 67 67 68 74 108 99 16 51 16 51 20 51 51 2 14 44 1 106 80 4 76 1 12 14 42 31 14 27 27 14 15 43 43 17 6 1 3 7 34 3 1 1 17 18 1 12 1 1 9 10 10 10 2 8 10 5 4 1 27 79 1 78 1 79 140 140 5 136 134 43 42 42 49 2 45 79 24 16 14 25 51 48 3 51 12 1 11 64 15 1 52 23 31 6 1 34 11 18 1 17 9 1 7 6 6 6 1 6 27 17 17 3 1 9 262 238 55 84 4 28 1 74 11 70 30 73 40 32 7 6 7 9 1 1 7 65 18 35 59 85 6 4 1 2 3 2 2 4 5 1 1 1 3 35 35 2 1 4 6 1 1 1 1 2 1 1 2 1 1 2 1 1 1 85 33 4 56 57 90 15 13 3 12 2 14 14 12 12 10 8 1 8 1 8 1 7 2 8 1 8 1 9 35 35 36 36 36 36 34 1 36 1 36 31 34 36 11 36 1 36 4 36 2 36 1 36 1 36 1 36 1 6 5 6 22 2 21 1 22 1 22 23 23 16 6 23 23 91 92 257 89 57 56 73 20 2 93 92 68 1 68 67 17 19 32 32 1 1 12 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 // SPDX-License-Identifier: GPL-2.0 /* * Resizable virtual memory filesystem for Linux. * * Copyright (C) 2000 Linus Torvalds. * 2000 Transmeta Corp. * 2000-2001 Christoph Rohland * 2000-2001 SAP AG * 2002 Red Hat Inc. * Copyright (C) 2002-2011 Hugh Dickins. * Copyright (C) 2011 Google Inc. * Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * * Extended attribute support for tmpfs: * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> * * tiny-shmem: * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com> */ #include <linux/fs.h> #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fileattr.h> #include <linux/filelock.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/shmem_fs.h> #include <linux/swap.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/fs_parser.h> #include <linux/swapfile.h> #include <linux/iversion.h> #include <linux/unicode.h> #include "swap.h" static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* * This virtual memory filesystem is heavily based on the ramfs. It * extends ramfs by the ability to use swap and honor resource limits * which makes it a completely usable filesystem. */ #include <linux/xattr.h> #include <linux/exportfs.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/mman.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/folio_batch.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> #include <linux/splice.h> #include <linux/security.h> #include <linux/leafops.h> #include <linux/mempolicy.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> #include <linux/rmap.h> #include <linux/uuid.h> #include <linux/quotaops.h> #include <linux/rcupdate_wait.h> #include <linux/uaccess.h> #include "internal.h" #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 /* Pretend that one inode + its dentry occupy this much memory */ #define BOGO_INODE_SIZE 1024 /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 /* * shmem_fallocate communicates with shmem_fault or shmem_writeout via * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ pgoff_t nr_unswapped; /* how often writeout refused to swap out */ }; struct shmem_options { unsigned long long blocks; unsigned long long inodes; struct mempolicy *mpol; kuid_t uid; kgid_t gid; umode_t mode; bool full_inums; int huge; int seen; bool noswap; unsigned short quota_types; struct shmem_quota_limits qlimits; #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *encoding; bool strict_encoding; #endif #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 #define SHMEM_SEEN_QUOTA 16 }; #ifdef CONFIG_TRANSPARENT_HUGEPAGE static unsigned long huge_shmem_orders_always __read_mostly; static unsigned long huge_shmem_orders_madvise __read_mostly; static unsigned long huge_shmem_orders_inherit __read_mostly; static unsigned long huge_shmem_orders_within_size __read_mostly; static bool shmem_orders_configured __initdata; #endif #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { return totalram_pages() / 2; } static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); return min3(nr_pages - totalhigh_pages(), nr_pages / 2, ULONG_MAX / BOGO_INODE_SIZE); } #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { return sb->s_fs_info; } /* * shmem_file_setup pre-accounts the whole fixed size of a VM object, * for shared memory and for shared anonymous (/dev/zero) mappings * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), * consistent with the pre-accounting of private mappings ... */ static inline int shmem_acct_size(unsigned long flags, loff_t size) { return (flags & SHMEM_F_NORESERVE) ? 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { if (!(flags & SHMEM_F_NORESERVE)) vm_unacct_memory(VM_ACCT(size)); } static inline int shmem_reacct_size(unsigned long flags, loff_t oldsize, loff_t newsize) { if (!(flags & SHMEM_F_NORESERVE)) { if (VM_ACCT(newsize) > VM_ACCT(oldsize)) return security_vm_enough_memory_mm(current->mm, VM_ACCT(newsize) - VM_ACCT(oldsize)); else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); } return 0; } /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ static inline int shmem_acct_blocks(unsigned long flags, long pages) { if (!(flags & SHMEM_F_NORESERVE)) return 0; return security_vm_enough_memory_mm(current->mm, pages * VM_ACCT(PAGE_SIZE)); } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { if (flags & SHMEM_F_NORESERVE) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (!percpu_counter_limited_add(&sbinfo->used_blocks, sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); if (err) { percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) goto unacct; } return 0; unacct: shmem_unacct_blocks(info->flags, pages); return err; } static void shmem_inode_unacct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); might_sleep(); /* when quotas */ dquot_free_block_nodirty(inode, pages); if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); shmem_unacct_blocks(info->flags, pages); } static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; bool shmem_mapping(const struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); static DEFINE_SPINLOCK(shmem_swaplist_lock); #ifdef CONFIG_TMPFS_QUOTA static int shmem_enable_quotas(struct super_block *sb, unsigned short quota_types) { int type, err = 0; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < SHMEM_MAXQUOTAS; type++) { if (!(quota_types & (1 << type))) continue; err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); if (err) goto out_err; } return 0; out_err: pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", type, err); for (type--; type >= 0; type--) dquot_quota_off(sb, type); return err; } static void shmem_disable_quotas(struct super_block *sb) { int type; for (type = 0; type < SHMEM_MAXQUOTAS; type++) dquot_quota_off(sb, type); } static struct dquot __rcu **shmem_get_dquots(struct inode *inode) { return SHMEM_I(inode)->i_dquot; } #endif /* CONFIG_TMPFS_QUOTA */ /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. * * It may also be called when making a hard link to permit the space needed by * each dentry. However, in that case, no new inode number is needed since that * internally draws from another pool of inode numbers (currently global * get_next_ino()). This case is indicated by passing NULL as inop. */ #define SHMEM_INO_BATCH 1024 static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) ino = sbinfo->next_ino++; if (unlikely(!sbinfo->full_inums && ino > UINT_MAX)) { /* * Emulate get_next_ino uint wraparound for * compatibility */ if (IS_ENABLED(CONFIG_64BIT)) pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", __func__, MINOR(sb->s_dev)); sbinfo->next_ino = 1; ino = sbinfo->next_ino++; } *inop = ino; } raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it * doesn't hold stat_lock in shmem_reserve_inode since * max_inodes is always 0, and is called from potentially * unknown contexts. As such, use a per-cpu batched allocator * which doesn't require the per-sb stat_lock unless we are at * the batch boundary. * * We don't need to worry about inode{32,64} since SB_KERNMOUNT * shmem mounts are not exposed to userspace, so we don't need * to worry about things like glibc compatibility. */ ino_t *next_ino; next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } *inop = ino; *next_ino = ++ino; put_cpu(); } return 0; } static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * @alloced: the change in number of pages allocated to inode * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * * Return: true if swapped was incremented from 0, for shmem_writeout(). */ bool shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); bool first_swapped = false; long freed; spin_lock(&info->lock); info->alloced += alloced; info->swapped += swapped; freed = info->alloced - info->swapped - READ_ONCE(inode->i_mapping->nrpages); /* * Special case: whereas normally shmem_recalc_inode() is called * after i_mapping->nrpages has already been adjusted (up or down), * shmem_writeout() has to raise swapped before nrpages is lowered - * to stop a racing shmem_recalc_inode() from thinking that a page has * been freed. Compensate here, to avoid the need for a followup call. */ if (swapped > 0) { if (info->swapped == swapped) first_swapped = true; freed += swapped; } if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); /* The quota case may block */ if (freed > 0) shmem_inode_unacct_blocks(inode, freed); return first_swapped; } bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ xa_lock_irq(&mapping->i_pages); mapping->nrpages += pages; xa_unlock_irq(&mapping->i_pages); shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ shmem_recalc_inode(inode, 0, 0); } /* * Replace item expected in xarray by a new item, while holding xa_lock. */ static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); item = xas_load(&xas); if (item != expected) return -ENOENT; xas_store(&xas, replacement); return 0; } /* * Sometimes, before we decide whether to proceed or to fail, we must check * that an entry was not already brought back or split by a racing thread. * * Checking folio is not enough: by the time a swapcache folio is locked, it * might be reused, and again be swapcache, using the same swap as before. * Returns the swap entry's order if it still presents, else returns -1. */ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { XA_STATE(xas, &mapping->i_pages, index); int ret = -1; void *entry; rcu_read_lock(); do { entry = xas_load(&xas); if (entry == swp_to_radix_entry(swap)) ret = xas_get_order(&xas); } while (xas_retry(&xas, entry)); rcu_read_unlock(); return ret; } /* * Definitions for "huge tmpfs": tmpfs mounted with the huge= option * * SHMEM_HUGE_NEVER: * disables huge pages for the mount; * SHMEM_HUGE_ALWAYS: * enables huge pages for the mount; * SHMEM_HUGE_WITHIN_SIZE: * only allocate huge pages if the page will be fully within i_size, * also respect madvise() hints; * SHMEM_HUGE_ADVISE: * only allocate huge pages if requested with madvise(); */ #define SHMEM_HUGE_NEVER 0 #define SHMEM_HUGE_ALWAYS 1 #define SHMEM_HUGE_WITHIN_SIZE 2 #define SHMEM_HUGE_ADVISE 3 /* * Special values. * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: * * SHMEM_HUGE_DENY: * disables huge on shm_mnt and all mounts, for emergency use; * SHMEM_HUGE_FORCE: * enables huge on shm_mnt and all mounts, w/o needing option, for testing; * */ #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_NEVER) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ALWAYS) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ALWAYS #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_WITHIN_SIZE) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_SHMEM_HUGE_ADVISE) #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_ADVISE #else #define SHMEM_HUGE_DEFAULT SHMEM_HUGE_NEVER #endif static int shmem_huge __read_mostly = SHMEM_HUGE_DEFAULT; #undef SHMEM_HUGE_DEFAULT #if defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_NEVER) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ALWAYS) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ALWAYS #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_WITHIN_SIZE) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_WITHIN_SIZE #elif defined(CONFIG_TRANSPARENT_HUGEPAGE_TMPFS_HUGE_ADVISE) #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_ADVISE #else #define TMPFS_HUGE_DEFAULT SHMEM_HUGE_NEVER #endif static int tmpfs_huge __read_mostly = TMPFS_HUGE_DEFAULT; #undef TMPFS_HUGE_DEFAULT static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, loff_t write_end) { pgoff_t aligned_index; unsigned long order; loff_t i_size; order = highest_order(within_size_orders); while (within_size_orders) { aligned_index = round_up(index + 1, 1 << order); i_size = max(write_end, i_size_read(inode)); i_size = round_up(i_size, PAGE_SIZE); if (i_size >> PAGE_SHIFT >= aligned_index) return within_size_orders; order = next_order(&within_size_orders, order); } return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); unsigned long within_size_orders; if (!S_ISREG(inode->i_mode)) return 0; if (shmem_huge == SHMEM_HUGE_DENY) return 0; if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) return maybe_pmd_order; /* * The huge order allocation for anon shmem is controlled through * the mTHP interface, so we still use PMD-sized huge order to * check whether global control is enabled. * * For tmpfs with 'huge=always' or 'huge=within_size' mount option, * we will always try PMD-sized order first. If that failed, it will * fall back to small large folios. */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: within_size_orders = shmem_get_orders_within_size(inode, THP_ORDERS_ALL_FILE_DEFAULT, index, write_end); if (within_size_orders > 0) return within_size_orders; fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) return THP_ORDERS_ALL_FILE_DEFAULT; fallthrough; default: return 0; } } static int shmem_parse_huge(const char *str) { int huge; if (!str) return -EINVAL; if (!strcmp(str, "never")) huge = SHMEM_HUGE_NEVER; else if (!strcmp(str, "always")) huge = SHMEM_HUGE_ALWAYS; else if (!strcmp(str, "within_size")) huge = SHMEM_HUGE_WITHIN_SIZE; else if (!strcmp(str, "advise")) huge = SHMEM_HUGE_ADVISE; else if (!strcmp(str, "deny")) huge = SHMEM_HUGE_DENY; else if (!strcmp(str, "force")) huge = SHMEM_HUGE_FORCE; else return -EINVAL; if (!has_transparent_hugepage() && huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) return -EINVAL; /* Do not override huge allocation policy with non-PMD sized mTHP */ if (huge == SHMEM_HUGE_FORCE && huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) return -EINVAL; return huge; } #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) { switch (huge) { case SHMEM_HUGE_NEVER: return "never"; case SHMEM_HUGE_ALWAYS: return "always"; case SHMEM_HUGE_WITHIN_SIZE: return "within_size"; case SHMEM_HUGE_ADVISE: return "advise"; case SHMEM_HUGE_DENY: return "deny"; case SHMEM_HUGE_FORCE: return "force"; default: VM_BUG_ON(1); return "bad_val"; } } #endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { LIST_HEAD(list), *pos, *next; struct inode *inode; struct shmem_inode_info *info; struct folio *folio; unsigned long batch = sc ? sc->nr_to_scan : 128; unsigned long split = 0, freed = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; spin_lock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &sbinfo->shrinklist) { info = list_entry(pos, struct shmem_inode_info, shrinklist); /* pin the inode */ inode = igrab(&info->vfs_inode); /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); goto next; } list_move(&info->shrinklist, &list); next: sbinfo->shrinklist_len--; if (!--batch) break; } spin_unlock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &list) { pgoff_t next, end; loff_t i_size; int ret; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; if (nr_to_free && freed >= nr_to_free) goto move_back; i_size = i_size_read(inode); folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE); if (!folio || xa_is_value(folio)) goto drop; /* No large folio at the end of the file: nothing to split */ if (!folio_test_large(folio)) { folio_put(folio); goto drop; } /* Check if there is anything to gain from splitting */ next = folio_next_index(folio); end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE)); if (end <= folio->index || end >= next) { folio_put(folio); goto drop; } /* * Move the inode on the list back to shrinklist if we failed * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!folio_trylock(folio)) { folio_put(folio); goto move_back; } ret = split_folio(folio); folio_unlock(folio); folio_put(folio); /* If split failed move the inode on the list back to shrinklist */ if (ret) goto move_back; freed += next - end; split++; drop: list_del_init(&info->shrinklist); goto put; move_back: /* * Make sure the inode is either on the global list or deleted * from any local list before iput() since it could be deleted * in another thread once we put the inode (then the local list * is corrupted). */ spin_lock(&sbinfo->shrinklist_lock); list_move(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; spin_unlock(&sbinfo->shrinklist_lock); put: iput(inode); } return split; } static long shmem_unused_huge_scan(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (!READ_ONCE(sbinfo->shrinklist_len)) return SHRINK_STOP; return shmem_unused_huge_shrink(sbinfo, sc, 0); } static long shmem_unused_huge_count(struct super_block *sb, struct shrink_control *sc) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_free) { return 0; } static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, vm_flags_t vm_flags) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static void shmem_update_stats(struct folio *folio, int nr_pages) { if (folio_test_pmd_mappable(folio)) lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages); lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages); } /* * Somewhat like filemap_add_folio, but error if expected item has gone. */ int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); unsigned long nr = folio_nr_pages(folio); swp_entry_t iter, swap; void *entry; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); folio_ref_add(folio, nr); folio->mapping = mapping; folio->index = index; gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); swap = radix_to_swp_entry(expected); do { iter = swap; xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { /* * The range must either be empty, or filled with * expected swap entries. Shmem swap entries are never * partially freed without split of both entry and * folio, so there shouldn't be any holes. */ if (!expected || entry != swp_to_radix_entry(iter)) { xas_set_err(&xas, -EEXIST); goto unlock; } iter.val += 1 << xas_get_order(&xas); } if (expected && iter.val - nr != swap.val) { xas_set_err(&xas, -EEXIST); goto unlock; } xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; shmem_update_stats(folio, nr); mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { folio->mapping = NULL; folio_ref_sub(folio, nr); return xas_error(&xas); } return 0; } /* * Somewhat like filemap_remove_folio, but substitutes swap for @folio. */ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { struct address_space *mapping = folio->mapping; long nr = folio_nr_pages(folio); int error; xa_lock_irq(&mapping->i_pages); error = shmem_replace_entry(mapping, folio->index, folio, radswap); folio->mapping = NULL; mapping->nrpages -= nr; shmem_update_stats(folio, -nr); xa_unlock_irq(&mapping->i_pages); folio_put_refs(folio, nr); BUG_ON(error); } /* * Remove swap entry from page cache, free the swap and its page cache. Returns * the number of pages being freed. 0 means entry not found in XArray (0 pages * being freed). */ static long shmem_free_swap(struct address_space *mapping, pgoff_t index, pgoff_t end, void *radswap) { XA_STATE(xas, &mapping->i_pages, index); unsigned int nr_pages = 0; pgoff_t base; void *entry; xas_lock_irq(&xas); entry = xas_load(&xas); if (entry == radswap) { nr_pages = 1 << xas_get_order(&xas); base = round_down(xas.xa_index, nr_pages); if (base < index || base + nr_pages - 1 > end) nr_pages = 0; else xas_store(&xas, NULL); } xas_unlock_irq(&xas); if (nr_pages) swap_put_entries_direct(radix_to_swp_entry(radswap), nr_pages); return nr_pages; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); xas_for_each(&xas, folio, max) { if (xas_retry(&xas, folio)) continue; if (xa_is_value(folio)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return swapped << PAGE_SHIFT; } /* * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; unsigned long swapped; /* Be careful as we don't hold info->lock */ swapped = READ_ONCE(info->swapped); /* * The easier cases are when the shmem object has nothing in swap, or * the vma maps it whole. Then we can simply use the stats that we * already track. */ if (!swapped) return 0; if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ return shmem_partial_swap_usage(mapping, vma->vm_pgoff, vma->vm_pgoff + vma_pages(vma)); } /* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ void shmem_unlock_mapping(struct address_space *mapping) { struct folio_batch fbatch; pgoff_t index = 0; folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ while (!mapping_unevictable(mapping) && filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { check_move_unevictable_folios(&fbatch); folio_batch_release(&fbatch); cond_resched(); } } static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; /* * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated folios as holes. */ folio = filemap_get_entry(inode->i_mapping, index); if (!folio) return folio; if (!xa_is_value(folio)) { folio_lock(folio); if (folio->mapping == inode->i_mapping) return folio; /* The folio has been swapped out */ folio_unlock(folio); folio_put(folio); } /* * But read a folio back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ folio = NULL; shmem_get_folio(inode, index, 0, &folio, SGP_READ); return folio; } /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, bool unfalloc) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t indices[FOLIO_BATCH_SIZE]; struct folio *folio; bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; if (lend == -1) end = -1; /* unsigned, so actually very big */ if (info->fallocend > start && info->fallocend <= end && !unfalloc) info->fallocend = start; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += shmem_free_swap(mapping, indices[i], end - 1, folio); continue; } if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * When undoing a failed fallocate, we want none of the partial folio * zeroing and splitting below, but shall want to truncate the whole * folio when !uptodate indicates that it was added by this fallocate, * even when [lstart, lend] covers only a part of the folio. */ if (unfalloc) goto whole_folios; same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { same_folio = lend < folio_next_pos(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); if (folio) { folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } whole_folios: index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; /* But if truncating, restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; if (xa_is_value(folio)) { int order; long swaps_freed; if (unfalloc) continue; swaps_freed = shmem_free_swap(mapping, indices[i], end - 1, folio); if (!swaps_freed) { pgoff_t base = indices[i]; order = shmem_confirm_swap(mapping, indices[i], radix_to_swp_entry(folio)); /* * If found a large swap entry cross the end or start * border, skip it as the truncate_inode_partial_folio * above should have at least zerod its content once. */ if (order > 0) { base = round_down(base, 1 << order); if (base < start || base + (1 << order) > end) continue; } /* Swap was replaced by page or extended, retry */ index = base; break; } nr_swaps_freed += swaps_freed; continue; } folio_lock(folio); if (!unfalloc || !folio_test_uptodate(folio)) { if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); if (!folio_test_large(folio)) { truncate_inode_folio(mapping, folio); } else if (truncate_inode_partial_folio(folio, lstart, lend)) { /* * If we split a page, reset the loop so * that we pick up the new sub pages. * Otherwise the THP was entirely * dropped or the target range was * zeroed, so just continue the loop as * is. */ if (!folio_test_large(folio)) { folio_unlock(folio); index = start; break; } } } folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); } shmem_recalc_inode(inode, 0, -nr_swaps_freed); } void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); if (info->alloced - info->swapped != inode->i_mapping->nrpages) shmem_recalc_inode(inode, 0, 0); if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (info->fsflags & FS_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = info->i_crtime.tv_sec; stat->btime.tv_nsec = info->i_crtime.tv_nsec; } return 0; } static int shmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; bool update_mtime = false; bool update_ctime = true; error = setattr_prepare(idmap, dentry, attr); if (error) return error; if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) { if ((inode->i_mode ^ attr->ia_mode) & 0111) { return -EPERM; } } if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; if (newsize != oldsize) { if (info->flags & SHMEM_F_MAPPING_FROZEN) return -EPERM; error = shmem_reacct_size(SHMEM_I(inode)->flags, oldsize, newsize); if (error) return error; i_size_write(inode, newsize); update_mtime = true; } else { update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); if (info->alloced) shmem_truncate_range(inode, newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } } if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } /* Transfer quota accounting */ if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); if (error) return error; } setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(idmap, dentry, inode->i_mode); if (!error && update_ctime) { inode_set_ctime_current(inode); if (update_mtime) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); inode_inc_iversion(inode); } return error; } static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) { spin_lock(&sbinfo->shrinklist_lock); if (!list_empty(&info->shrinklist)) { list_del_init(&info->shrinklist); sbinfo->shrinklist_len--; } spin_unlock(&sbinfo->shrinklist_lock); } while (!list_empty(&info->swaplist)) { /* Wait while shmem_unuse() is scanning this inode... */ wait_var_event(&info->stop_eviction, !atomic_read(&info->stop_eviction)); spin_lock(&shmem_swaplist_lock); /* ...but beware of the race if we peeked too early */ if (!atomic_read(&info->stop_eviction)) list_del_init(&info->swaplist); spin_unlock(&shmem_swaplist_lock); } } if (info->xattrs) { simple_xattrs_free(info->xattrs, sbinfo->max_inodes ? &freed : NULL); kfree(info->xattrs); } shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); #ifdef CONFIG_TMPFS_QUOTA dquot_free_inode(inode); dquot_drop(inode); #endif } static unsigned int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, struct folio_batch *fbatch, pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; swp_entry_t entry; rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (!xa_is_value(folio)) continue; entry = radix_to_swp_entry(folio); /* * swapin error entries can be found in the mapping. But they're * deliberately ignored here as we've done everything we can do. */ if (swp_type(entry) != type) continue; indices[folio_batch_count(fbatch)] = xas.xa_index; if (!folio_batch_add(fbatch, folio)) break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); return folio_batch_count(fbatch); } /* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure. */ static int shmem_unuse_swap_entries(struct inode *inode, struct folio_batch *fbatch, pgoff_t *indices) { int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); ret++; } if (error == -ENOMEM) break; error = 0; } return error ? error : ret; } /* * If swap found in inode, free it and move page from swapcache to filecache. */ static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct folio_batch fbatch; pgoff_t indices[FOLIO_BATCH_SIZE]; int ret = 0; do { folio_batch_init(&fbatch); if (!shmem_find_swap_entries(mapping, start, &fbatch, indices, type)) { ret = 0; break; } ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break; start = indices[folio_batch_count(&fbatch) - 1]; } while (true); return ret; } /* * Read all the shared memory data that resides in the swap * device 'type' back into memory, so the swap device can be * unused. */ int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; if (list_empty(&shmem_swaplist)) return 0; spin_lock(&shmem_swaplist_lock); start_over: list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } /* * Drop the swaplist mutex while searching the inode for swap; * but before doing so, make sure shmem_evict_inode() will not * remove placeholder inode from swaplist, nor let it be freed * (igrab() would protect from unlink, but not from unmount). */ atomic_inc(&info->stop_eviction); spin_unlock(&shmem_swaplist_lock); error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); spin_lock(&shmem_swaplist_lock); if (atomic_dec_and_test(&info->stop_eviction)) wake_up_var(&info->stop_eviction); if (error) break; if (list_empty(&info->swaplist)) goto start_over; next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); } spin_unlock(&shmem_swaplist_lock); return error; } /** * shmem_writeout - Write the folio to swap * @folio: The folio to write * @plug: swap plug * @folio_list: list to put back folios on split * * Move the folio from the page cache to the swap cache. */ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list) { struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); pgoff_t index; int nr_pages; bool split = false; if ((info->flags & SHMEM_F_LOCKED) || sbinfo->noswap) goto redirty; if (!total_swap_pages) goto redirty; /* * If CONFIG_THP_SWAP is not enabled, the large folio should be * split when swapping. * * And shrinkage of pages beyond i_size does not split swap, so * swapout of a large folio crossing i_size needs to split too * (unless fallocate has been used to preallocate beyond EOF). */ if (folio_test_large(folio)) { index = shmem_fallocend(inode, DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE)); if ((index > folio->index && index < folio_next_index(folio)) || !IS_ENABLED(CONFIG_THP_SWAP)) split = true; } if (split) { int order; try_split: order = folio_order(folio); /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); if (split_folio_to_list(folio, folio_list)) goto redirty; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order >= HPAGE_PMD_ORDER) { count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } #endif count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); folio_clear_dirty(folio); } index = folio->index; nr_pages = folio_nr_pages(folio); /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a * fallocated folio arriving here is now to initialize it and write it. * * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So * reactivate the folio, and let shmem_fallocate() quit when too many. */ if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped += nr_pages; else shmem_falloc = NULL; spin_unlock(&inode->i_lock); if (shmem_falloc) goto redirty; } folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } if (!folio_alloc_swap(folio)) { bool first_swapped = shmem_recalc_inode(inode, 0, nr_pages); int error; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the folio is * removed from page cache, when its pagelock no longer * protects the inode from eviction. And do it now, after * we've incremented swapped, because shmem_unuse() will * prune a !swapped inode from the swaplist. */ if (first_swapped) { spin_lock(&shmem_swaplist_lock); if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); spin_unlock(&shmem_swaplist_lock); } folio_dup_swap(folio, NULL); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); error = swap_writeout(folio, plug); if (error != AOP_WRITEPAGE_ACTIVATE) { /* folio has been unlocked */ return error; } /* * The intention here is to avoid holding on to the swap when * zswap was unable to compress and unable to writeback; but * it will be appropriate if other reactivate cases are added. */ error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(folio->swap), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); folio_put_swap(folio, NULL); } /* * The swap_cache_del_folio() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. folio_mapping(folio) might give an unexpected answer. */ swap_cache_del_folio(folio); goto redirty; } if (nr_pages > 1) goto try_split; redirty: folio_mark_dirty(folio); return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ } EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; if (!mpol || mpol->mode == MPOL_DEFAULT) return; /* show nothing */ mpol_to_str(buffer, sizeof(buffer), mpol); seq_printf(seq, ",mpol=%s", buffer); } static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } #else /* !CONFIG_NUMA || !CONFIG_TMPFS */ static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx); static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); folio = swap_cluster_readahead(swap, gfp, mpol, ilx); mpol_cond_put(mpol); return folio; } /* * Make sure huge_gfp is always more limited than limit_gfp. * Some of the flags set permissions, while others set limitations. */ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) { gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); /* Allow allocations only from the originally specified zones. */ result |= zoneflags; /* * Minimize the result gfp by taking the union with the deny flags, * and the intersection of the allow flags. */ result |= (limit_gfp & denyflags); result |= (huge_gfp & limit_gfp) & allowflags; return result; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_hpage_pmd_enabled(void) { if (shmem_huge == SHMEM_HUGE_DENY) return false; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) return true; if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && shmem_huge != SHMEM_HUGE_NEVER) return true; return false; } unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, shmem_huge_force, vma, vm_flags); /* Tmpfs huge pages allocation */ if (!vma || !vma_is_anon_shmem(vma)) return global_orders; /* * Following the 'deny' semantics of the top level, force the huge * option off from all mounts. */ if (shmem_huge == SHMEM_HUGE_DENY) return 0; /* * Only allow inherit orders if the top-level value is 'force', which * means non-PMD sized THP can not override 'huge' mount option now. */ if (shmem_huge == SHMEM_HUGE_FORCE) return READ_ONCE(huge_shmem_orders_inherit); /* Allow mTHP that will be fully within i_size. */ mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); if (global_orders > 0) mask |= READ_ONCE(huge_shmem_orders_inherit); return THP_ORDERS_ALL_FILE_DEFAULT & mask; } static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; pgoff_t aligned_index; unsigned long pages; int order; if (vma) { orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) return 0; } /* Find the highest order that can add into the page cache */ order = highest_order(orders); while (orders) { pages = 1UL << order; aligned_index = round_down(index, pages); /* * Check for conflict before waiting on a huge allocation. * Conflict might be that a huge page has just been allocated * and added to page cache by a racing thread, or that there * is already at least one small page in the huge extent. * Be careful to retry when appropriate, but not forever! * Elsewhere -EEXIST would be the right code, but not here. */ if (!xa_find(&mapping->i_pages, &aligned_index, aligned_index + pages - 1, XA_PRESENT)) break; order = next_order(&orders, order); } return orders; } #else static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf, struct address_space *mapping, pgoff_t index, unsigned long orders) { return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static struct folio *shmem_alloc_folio(gfp_t gfp, int order, struct shmem_inode_info *info, pgoff_t index) { struct mempolicy *mpol; pgoff_t ilx; struct folio *folio; mpol = shmem_get_pgoff_policy(info, index, order, &ilx); folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id()); mpol_cond_put(mpol); return folio; } static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf, gfp_t gfp, struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, unsigned long orders) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); unsigned long suitable_orders = 0; struct folio *folio = NULL; pgoff_t aligned_index; long pages; int error, order; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) orders = 0; if (orders > 0) { suitable_orders = shmem_suitable_orders(inode, vmf, mapping, index, orders); order = highest_order(suitable_orders); while (suitable_orders) { pages = 1UL << order; aligned_index = round_down(index, pages); folio = shmem_alloc_folio(gfp, order, info, aligned_index); if (folio) { index = aligned_index; goto allocated; } if (pages == HPAGE_PMD_NR) count_vm_event(THP_FILE_FALLBACK); count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK); order = next_order(&suitable_orders, order); } } else { pages = 1; folio = shmem_alloc_folio(gfp, 0, info, index); } if (!folio) return ERR_PTR(-ENOMEM); allocated: __folio_set_locked(folio); __folio_set_swapbacked(folio); gfp &= GFP_RECLAIM_MASK; error = mem_cgroup_charge(folio, fault_mm, gfp); if (error) { if (xa_find(&mapping->i_pages, &index, index + pages - 1, XA_PRESENT)) { error = -EEXIST; } else if (pages > 1) { if (pages == HPAGE_PMD_NR) { count_vm_event(THP_FILE_FALLBACK); count_vm_event(THP_FILE_FALLBACK_CHARGE); } count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE); } goto unlock; } error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); if (error) goto unlock; error = shmem_inode_acct_blocks(inode, pages); if (error) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); long freed; /* * Try to reclaim some space by splitting a few * large folios beyond i_size on the filesystem. */ shmem_unused_huge_shrink(sbinfo, NULL, pages); /* * And do a shmem_recalc_inode() to account for freed pages: * except our folio is there in cache, so not quite balanced. */ spin_lock(&info->lock); freed = pages + info->alloced - info->swapped - READ_ONCE(mapping->nrpages); if (freed > 0) info->alloced -= freed; spin_unlock(&info->lock); if (freed > 0) shmem_inode_unacct_blocks(inode, freed); error = shmem_inode_acct_blocks(inode, pages); if (error) { filemap_remove_folio(folio); goto unlock; } } shmem_recalc_inode(inode, pages, 0); folio_add_lru(folio); return folio; unlock: folio_unlock(folio); folio_put(folio); return ERR_PTR(error); } static struct folio *shmem_swap_alloc_folio(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, swp_entry_t entry, int order, gfp_t gfp) { struct shmem_inode_info *info = SHMEM_I(inode); struct folio *new, *swapcache; int nr_pages = 1 << order; gfp_t alloc_gfp = gfp; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (WARN_ON_ONCE(order)) return ERR_PTR(-EINVAL); } else if (order) { /* * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. * Any existing sub folio in the swap cache also blocks * mTHP swapin. */ if ((vma && unlikely(userfaultfd_armed(vma))) || !zswap_never_enabled() || non_swapcache_batch(entry, nr_pages) != nr_pages) goto fallback; alloc_gfp = limit_gfp_mask(vma_thp_gfp_mask(vma), gfp); } retry: new = shmem_alloc_folio(alloc_gfp, order, info, index); if (!new) { new = ERR_PTR(-ENOMEM); goto fallback; } if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL, alloc_gfp, entry)) { folio_put(new); new = ERR_PTR(-ENOMEM); goto fallback; } swapcache = swapin_folio(entry, new); if (swapcache != new) { folio_put(new); if (!swapcache) { /* * The new folio is charged already, swapin can * only fail due to another raced swapin. */ new = ERR_PTR(-EEXIST); goto fallback; } } return swapcache; fallback: /* Order 0 swapin failed, nothing to fallback to, abort */ if (!order) return new; entry.val += index - round_down(index, nr_pages); alloc_gfp = gfp; nr_pages = 1; order = 0; goto retry; } /* * When a page is moved from swapcache to shmem filecache (either by the * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), * we may need to copy to a suitable page before moving to filecache. * * In a future release, this may well be extended to respect cpuset and * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone. */ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) { return folio_zonenum(folio) > gfp_zone(gfp); } static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { struct swap_cluster_info *ci; struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; int nr_pages = folio_nr_pages(old); int error = 0; /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (nr_pages > 1) { gfp_t huge_gfp = vma_thp_gfp_mask(vma); gfp = limit_gfp_mask(huge_gfp, gfp); } #endif new = shmem_alloc_folio(gfp, folio_order(old), info, index); if (!new) return -ENOMEM; folio_ref_add(new, nr_pages); folio_copy(new, old); flush_dcache_folio(new); __folio_set_locked(new); __folio_set_swapbacked(new); folio_mark_uptodate(new); new->swap = entry; folio_set_swapcache(new); ci = swap_cluster_get_and_lock_irq(old); __swap_cache_replace_folio(ci, old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); swap_cluster_unlock_irq(ci); folio_add_lru(new); *foliop = new; folio_clear_swapcache(old); old->private = NULL; folio_unlock(old); /* * The old folio are removed from swap cache, drop the 'nr_pages' * reference, as well as one temporary reference getting from swap * cache. */ folio_put_refs(old, nr_pages + 1); return error; } static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct folio *folio, swp_entry_t swap) { struct address_space *mapping = inode->i_mapping; swp_entry_t swapin_error; void *old; int nr_pages; swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); if (old != swp_to_radix_entry(swap)) return; nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); folio_put_swap(folio, NULL); swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) * in shmem_evict_inode(). */ shmem_recalc_inode(inode, -nr_pages, -nr_pages); } static int shmem_split_large_entry(struct inode *inode, pgoff_t index, swp_entry_t swap, gfp_t gfp) { struct address_space *mapping = inode->i_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); int split_order = 0; int i; /* Convert user data gfp flags to xarray node gfp flags */ gfp &= GFP_RECLAIM_MASK; for (;;) { void *old = NULL; int cur_order; pgoff_t swap_index; xas_lock_irq(&xas); old = xas_load(&xas); if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) { xas_set_err(&xas, -EEXIST); goto unlock; } cur_order = xas_get_order(&xas); if (!cur_order) goto unlock; /* Try to split large swap entry in pagecache */ swap_index = round_down(index, 1 << cur_order); split_order = xas_try_split_min_order(cur_order); while (cur_order > 0) { pgoff_t aligned_index = round_down(index, 1 << cur_order); pgoff_t swap_offset = aligned_index - swap_index; xas_set_order(&xas, index, split_order); xas_try_split(&xas, old, cur_order); if (xas_error(&xas)) goto unlock; /* * Re-set the swap entry after splitting, and the swap * offset of the original large entry must be continuous. */ for (i = 0; i < 1 << cur_order; i += (1 << split_order)) { swp_entry_t tmp; tmp = swp_entry(swp_type(swap), swp_offset(swap) + swap_offset + i); __xa_store(&mapping->i_pages, aligned_index + i, swp_to_radix_entry(tmp), 0); } cur_order = split_order; split_order = xas_try_split_min_order(split_order); } unlock: xas_unlock_irq(&xas); if (!xas_nomem(&xas, gfp)) break; } if (xas_error(&xas)) return xas_error(&xas); return 0; } /* * Swap in the folio pointed to by *foliop. * Caller has to make sure that *foliop contains a valid swapped folio. * Returns 0 and the folio in foliop if success. On failure, returns the * error code and NULL in *foliop. */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL; struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swap; softleaf_t index_entry; struct swap_info_struct *si; struct folio *folio = NULL; int error, nr_pages, order; pgoff_t offset; VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); index_entry = radix_to_swp_entry(*foliop); swap = index_entry; *foliop = NULL; if (softleaf_is_poison_marker(index_entry)) return -EIO; si = get_swap_device(index_entry); order = shmem_confirm_swap(mapping, index, index_entry); if (unlikely(!si)) { if (order < 0) return -EEXIST; else return -EINVAL; } if (unlikely(order < 0)) { put_swap_device(si); return -EEXIST; } /* index may point to the middle of a large entry, get the sub entry */ if (order) { offset = index - round_down(index, 1 << order); swap = swp_entry(swp_type(swap), swp_offset(swap) + offset); } /* Look it up and read it in.. */ folio = swap_cache_get_folio(swap); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ folio = shmem_swap_alloc_folio(inode, vma, index, index_entry, order, gfp); if (IS_ERR(folio)) { error = PTR_ERR(folio); folio = NULL; goto failed; } } else { /* Cached swapin only supports order 0 folio */ folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; } } if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } } else { swap_update_readahead(folio, NULL, 0); } if (order > folio_order(folio)) { /* * Swapin may get smaller folios due to various reasons: * It may fallback to order 0 due to memory pressure or race, * swap readahead may swap in order 0 folios into swapcache * asynchronously, while the shmem mapping can still stores * large swap entries. In such cases, we should split the * large swap entry to prevent possible data corruption. */ error = shmem_split_large_entry(inode, index, index_entry, gfp); if (error) goto failed_nolock; } /* * If the folio is large, round down swap and index by folio size. * No matter what race occurs, the swap layer ensures we either get * a valid folio that has its swap entry aligned by size, or a * temporarily invalid one which we'll abort very soon and retry. * * shmem_add_to_page_cache ensures the whole range contains expected * entries and prevents any corruption, so any race split is fine * too, it will succeed as long as the entries are still there. */ nr_pages = folio_nr_pages(folio); if (nr_pages > 1) { swap.val = round_down(swap.val, nr_pages); index = round_down(index, nr_pages); } /* * We have to do this with the folio locked to prevent races. * The shmem_confirm_swap below only checks if the first swap * entry matches the folio, that's enough to ensure the folio * is not used outside of shmem, as shmem swap entries * and swap cache folios are never partially freed. */ folio_lock(folio); if (!folio_matches_swap_entry(folio, swap) || shmem_confirm_swap(mapping, index, swap) < 0) { error = -EEXIST; goto unlock; } if (!folio_test_uptodate(folio)) { error = -EIO; goto failed; } folio_wait_writeback(folio); /* * Some architectures may have to restore extra metadata to the * folio after reading from swap. */ arch_swap_restore(folio_swap(swap, folio), folio); if (shmem_should_replace_folio(folio, gfp)) { error = shmem_replace_folio(&folio, gfp, info, index, vma); if (error) goto failed; } error = shmem_add_to_page_cache(folio, mapping, index, swp_to_radix_entry(swap), gfp); if (error) goto failed; shmem_recalc_inode(inode, 0, -nr_pages); if (sgp == SGP_WRITE) folio_mark_accessed(folio); folio_put_swap(folio, NULL); swap_cache_del_folio(folio); folio_mark_dirty(folio); put_swap_device(si); *foliop = folio; return 0; failed: if (shmem_confirm_swap(mapping, index, swap) < 0) error = -EEXIST; if (error == -EIO) shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: if (folio) folio_unlock(folio); failed_nolock: if (folio) folio_put(folio); put_swap_device(si); return error; } /* * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct mm_struct *fault_mm; struct folio *folio; int error; bool alloced; unsigned long orders = 0; if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) return -EINVAL; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; alloced = false; fault_mm = vma ? vma->vm_mm : NULL; folio = filemap_get_entry(inode->i_mapping, index); if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; *foliop = folio; return error; } if (folio) { folio_lock(folio); /* Has the folio been truncated or swapped out? */ if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; } if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); folio_put(folio); } /* * SGP_READ: succeed on hole, with NULL folio, letting caller zero. * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) return -ENOENT; /* * Fast cache lookup and swap lookup did not find it: allocate. */ /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */ orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false); if (orders > 0) { gfp_t huge_gfp; huge_gfp = vma_thp_gfp_mask(vma); huge_gfp = limit_gfp_mask(huge_gfp, gfp); folio = shmem_alloc_and_add_folio(vmf, huge_gfp, inode, index, fault_mm, orders); if (!IS_ERR(folio)) { if (folio_test_pmd_mappable(folio)) count_vm_event(THP_FILE_ALLOC); count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC); goto alloced; } if (PTR_ERR(folio) == -EEXIST) goto repeat; } folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0); if (IS_ERR(folio)) { error = PTR_ERR(folio); if (error == -EEXIST) goto repeat; folio = NULL; goto unlock; } alloced: alloced = true; if (folio_test_large(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio)) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to * ->shrink_list in shmem_unused_huge_shrink() */ if (list_empty_careful(&info->shrinklist)) { list_add_tail(&info->shrinklist, &sbinfo->shrinklist); sbinfo->shrinklist_len++; } spin_unlock(&sbinfo->shrinklist_lock); } if (sgp == SGP_WRITE) folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* * Let SGP_WRITE caller clear ends if write does not fill folio; * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { long i, n = folio_nr_pages(folio); for (i = 0; i < n; i++) clear_highpage(folio_page(folio, i)); flush_dcache_folio(folio); folio_mark_uptodate(folio); } /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; } out: *foliop = folio; return 0; /* * Error recovery. */ unlock: if (alloced) filemap_remove_folio(folio); shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } return error; } /** * shmem_get_folio - find, and lock a shmem folio. * @inode: inode to search * @index: the page index. * @write_end: end of a write, could extend inode size * @foliop: pointer to the folio if found * @sgp: SGP_* flags to control behavior * * Looks up the page cache entry at @inode & @index. If a folio is * present, it is returned locked with an increased refcount. * * If the caller modifies data in the folio, it must call folio_mark_dirty() * before unlocking the folio to ensure that the folio is not reclaimed. * There is no need to reserve space before calling folio_mark_dirty(). * * When no folio is found, the behavior depends on @sgp: * - for SGP_READ, *@foliop is %NULL and 0 is returned * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned * - for all other flags a new folio is allocated, inserted into the * page cache and returned locked in @foliop. * * Context: May sleep. * Return: 0 if successful, else a negative error code. */ int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the * target. */ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, * and the i_mmap tree grows ever slower to scan if new vmas are added. * * It does not matter if we sometimes reach this check just before the * hole-punch begins, so that one fault then races with the punch: * we just need to make racing faults a rare case. * * The implementation below would be much simpler if we just used a * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad. */ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) { struct shmem_falloc *shmem_falloc; struct file *fpin = NULL; vm_fault_t ret = 0; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && shmem_falloc->waitq && vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { wait_queue_head_t *shmem_falloc_waitq; DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; fpin = maybe_unlock_mmap_for_io(vmf, NULL); shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); schedule(); /* * shmem_falloc_waitq points into the shmem_fallocate() * stack of the hole-punching task: shmem_falloc_waitq * is usually invalid by the time we reach here, but * finish_wait() does not dereference it in that case; * though i_lock needed lest racing with wake_up_all(). */ spin_lock(&inode->i_lock); finish_wait(shmem_falloc_waitq, &shmem_fault_wait); } spin_unlock(&inode->i_lock); if (fpin) { fput(fpin); ret = VM_FAULT_RETRY; } return ret; } static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; vm_fault_t ret = 0; int err; /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: noted in i_private. */ if (unlikely(inode->i_private)) { ret = shmem_falloc_wait(vmf, inode); if (ret) return ret; } WARN_ON_ONCE(vmf->page != NULL); err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); ret |= VM_FAULT_LOCKED; } return ret; } unsigned long shmem_get_unmapped_area(struct file *file, unsigned long uaddr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long addr; unsigned long offset; unsigned long inflated_len; unsigned long inflated_addr; unsigned long inflated_offset; unsigned long hpage_size; if (len > TASK_SIZE) return -ENOMEM; addr = mm_get_unmapped_area(file, uaddr, len, pgoff, flags); if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; if (addr & ~PAGE_MASK) return addr; if (addr > TASK_SIZE - len) return addr; if (shmem_huge == SHMEM_HUGE_DENY) return addr; if (flags & MAP_FIXED) return addr; /* * Our priority is to support MAP_SHARED mapped hugely; * and support MAP_PRIVATE mapped hugely too, until it is COWed. * But if caller specified an address hint and we allocated area there * successfully, respect that as before. */ if (uaddr == addr) return addr; hpage_size = HPAGE_PMD_SIZE; if (shmem_huge != SHMEM_HUGE_FORCE) { struct super_block *sb; unsigned long __maybe_unused hpage_orders; int order = 0; if (file) { VM_BUG_ON(file->f_op != &shmem_file_operations); sb = file_inode(file)->i_sb; } else { /* * Called directly from mm/mmap.c, or drivers/char/mem.c * for "/dev/zero", to create a shared anonymous object. */ if (IS_ERR(shm_mnt)) return addr; sb = shm_mnt->mnt_sb; /* * Find the highest mTHP order used for anonymous shmem to * provide a suitable alignment address. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE hpage_orders = READ_ONCE(huge_shmem_orders_always); hpage_orders |= READ_ONCE(huge_shmem_orders_within_size); hpage_orders |= READ_ONCE(huge_shmem_orders_madvise); if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) hpage_orders |= READ_ONCE(huge_shmem_orders_inherit); if (hpage_orders > 0) { order = highest_order(hpage_orders); hpage_size = PAGE_SIZE << order; } #endif } if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order) return addr; } if (len < hpage_size) return addr; offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1); if (offset && offset + len < 2 * hpage_size) return addr; if ((addr & (hpage_size - 1)) == offset) return addr; inflated_len = len + hpage_size - PAGE_SIZE; if (inflated_len > TASK_SIZE) return addr; if (inflated_len < len) return addr; inflated_addr = mm_get_unmapped_area(NULL, uaddr, inflated_len, 0, flags); if (IS_ERR_VALUE(inflated_addr)) return addr; if (inflated_addr & ~PAGE_MASK) return addr; inflated_offset = inflated_addr & (hpage_size - 1); inflated_addr += offset - inflated_offset; if (inflated_offset > offset) inflated_addr += hpage_size; if (inflated_addr > TASK_SIZE - len) return addr; return inflated_addr; } #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { struct inode *inode = file_inode(vma->vm_file); return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; /* * Bias interleave by inode number to distribute better across nodes; * but this interface is independent of which page order is used, so * supplies only that bias, letting caller apply the offset (adjusted * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). */ *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { struct mempolicy *mpol; /* Bias interleave by inode number to distribute better across nodes */ *ilx = info->vfs_inode.i_ino + (index >> order); mpol = mpol_shared_policy_lookup(&info->policy, index); return mpol ? mpol : get_task_policy(current); } #else static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, pgoff_t index, unsigned int order, pgoff_t *ilx) { *ilx = 0; return NULL; } #endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; /* * What serializes the accesses to info->flags? * ipc_lock_object() when called from shmctl_do_lock(), * no serialization needed when called from shm_destroy(). */ if (lock && !(info->flags & SHMEM_F_LOCKED)) { if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; info->flags |= SHMEM_F_LOCKED; mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & SHMEM_F_LOCKED) && ucounts) { user_shm_unlock(inode->i_size, ucounts); info->flags &= ~SHMEM_F_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; out_nomem: return retval; } static int shmem_mmap_prepare(struct vm_area_desc *desc) { struct file *file = desc->file; struct inode *inode = file_inode(file); file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) desc->vm_ops = &shmem_vm_ops; else desc->vm_ops = &shmem_anon_vm_ops; return 0; } static int shmem_file_open(struct inode *inode, struct file *file) { file->f_mode |= FMODE_CAN_ODIRECT; return generic_file_open(inode, file); } #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); #if IS_ENABLED(CONFIG_UNICODE) /* * shmem_inode_casefold_flags - Deal with casefold file attribute flag * * The casefold file attribute needs some special checks. I can just be added to * an empty dir, and can't be removed from a non-empty dir. */ static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry, unsigned int *i_flags) { unsigned int old = inode->i_flags; struct super_block *sb = inode->i_sb; if (fsflags & FS_CASEFOLD_FL) { if (!(old & S_CASEFOLD)) { if (!sb->s_encoding) return -EOPNOTSUPP; if (!S_ISDIR(inode->i_mode)) return -ENOTDIR; if (dentry && !simple_empty(dentry)) return -ENOTEMPTY; } *i_flags = *i_flags | S_CASEFOLD; } else if (old & S_CASEFOLD) { if (dentry && !simple_empty(dentry)) return -ENOTEMPTY; } return 0; } #else static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry, unsigned int *i_flags) { if (fsflags & FS_CASEFOLD_FL) return -EOPNOTSUPP; return 0; } #endif /* * chattr's fsflags are unrelated to extended attributes, * but tmpfs has chosen to enable them under the same config option. */ static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { unsigned int i_flags = 0; int ret; ret = shmem_inode_casefold_flags(inode, fsflags, dentry, &i_flags); if (ret) return ret; if (fsflags & FS_NOATIME_FL) i_flags |= S_NOATIME; if (fsflags & FS_APPEND_FL) i_flags |= S_APPEND; if (fsflags & FS_IMMUTABLE_FL) i_flags |= S_IMMUTABLE; /* * But FS_NODUMP_FL does not require any action in i_flags. */ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD); return 0; } #else static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry) { } #define shmem_initxattrs NULL #endif static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) { return &SHMEM_I(inode)->dir_offsets; } static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; int err; err = shmem_reserve_inode(sb, &ino); if (err) return ERR_PTR(err); inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); return ERR_PTR(-ENOSPC); } inode->i_ino = ino; inode_init_owner(idmap, inode, dir, mode); inode->i_blocks = 0; simple_inode_init_ts(inode); inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; info->i_crtime = inode_get_mtime(inode); info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; if (info->fsflags) shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); /* Don't consider 'deny' for emergencies and 'force' for testing */ if (sbinfo->huge) mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, shmem_get_sbmpol(sbinfo)); break; case S_IFDIR: inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; inode->i_fop = &simple_offset_dir_operations; simple_offset_init(shmem_get_offset_ctx(inode)); break; case S_IFLNK: /* * Must not load anything in the rbtree, * mpol_free_shared_policy will not be called. */ mpol_shared_policy_init(&info->policy, NULL); break; } lockdep_annotate_inode_mutex_key(inode); return inode; } #ifdef CONFIG_TMPFS_QUOTA static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { int err; struct inode *inode; inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); if (IS_ERR(inode)) return inode; err = dquot_initialize(inode); if (err) goto errout; err = dquot_alloc_inode(inode); if (err) { dquot_drop(inode); goto errout; } return inode; errout: inode->i_flags |= S_NOQUOTA; iput(inode); return ERR_PTR(err); } #else static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); } #endif /* CONFIG_TMPFS_QUOTA */ #ifdef CONFIG_USERFAULTFD static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma, unsigned long addr) { struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t pgoff = linear_page_index(vma, addr); gfp_t gfp = mapping_gfp_mask(mapping); struct folio *folio; if (unlikely(pgoff >= DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) return NULL; folio = shmem_alloc_folio(gfp, 0, info, pgoff); if (!folio) return NULL; if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { folio_put(folio); return NULL; } return folio; } static int shmem_mfill_filemap_add(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct inode *inode = file_inode(vma->vm_file); struct address_space *mapping = inode->i_mapping; pgoff_t pgoff = linear_page_index(vma, addr); gfp_t gfp = mapping_gfp_mask(mapping); int err; __folio_set_locked(folio); __folio_set_swapbacked(folio); err = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (err) goto err_unlock; if (shmem_inode_acct_blocks(inode, 1)) { err = -ENOMEM; goto err_delete_from_cache; } folio_add_lru(folio); shmem_recalc_inode(inode, 1, 0); return 0; err_delete_from_cache: filemap_remove_folio(folio); err_unlock: folio_unlock(folio); return err; } static void shmem_mfill_filemap_remove(struct folio *folio, struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); filemap_remove_folio(folio); shmem_recalc_inode(inode, 0, 0); folio_unlock(folio); } static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t pgoff) { struct folio *folio; int err; err = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); if (err) return ERR_PTR(err); return folio; } static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags) { return true; } static const struct vm_uffd_ops shmem_uffd_ops = { .can_userfault = shmem_can_userfault, .get_folio_noalloc = shmem_get_folio_noalloc, .alloc_folio = shmem_mfill_folio_alloc, .filemap_add = shmem_mfill_filemap_add, .filemap_remove = shmem_mfill_filemap_remove, }; #endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; } if (unlikely((info->flags & SHMEM_F_MAPPING_FROZEN) && pos + len > inode->i_size)) return -EPERM; ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE); if (ret) return ret; if (folio_contain_hwpoisoned_page(folio)) { folio_unlock(folio); folio_put(folio); return -EIO; } *foliop = folio; return 0; } static int shmem_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); if (!folio_test_uptodate(folio)) { if (copied < folio_size(folio)) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + copied, folio_size(folio)); } folio_mark_uptodate(folio); } folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; int error = 0; ssize_t retval = 0; for (;;) { struct folio *folio = NULL; struct page *page = NULL; unsigned long nr, ret; loff_t end_offset, i_size = i_size_read(inode); bool fallback_page_copy = false; size_t fsize; if (unlikely(iocb->ki_pos >= i_size)) break; index = iocb->ki_pos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_copy = true; } /* * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ i_size = i_size_read(inode); if (unlikely(iocb->ki_pos >= i_size)) { if (folio) folio_put(folio); break; } end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); if (folio && likely(!fallback_page_copy)) fsize = folio_size(folio); else fsize = PAGE_SIZE; offset = iocb->ki_pos & (fsize - 1); nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_copy)) flush_dcache_folio(folio); else flush_dcache_page(page); } /* * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ if (likely(!fallback_page_copy)) ret = copy_folio_to_iter(folio, offset, nr, to); else ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but * clear_user() not so much, that it is noticeably * faster to copy the zero page instead of clearing. */ ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); } else { /* * But submitting the same page twice in a row to * splice() - or others? - can result in confusion: * so don't attempt that optimization on pipes etc. */ ret = iov_iter_zero(nr, to); } retval += ret; iocb->ki_pos += ret; if (!iov_iter_count(to)) break; if (ret < nr) { error = -EFAULT; break; } cond_resched(); } file_accessed(file); return retval ? retval : error; } static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto unlock; ret = file_remove_privs(file); if (ret) goto unlock; ret = file_update_time(file); if (ret) goto unlock; ret = generic_perform_write(iocb, from); unlock: inode_unlock(inode); return ret; } static bool zero_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return true; } static void zero_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { } static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return false; } static const struct pipe_buf_operations zero_pipe_buf_ops = { .release = zero_pipe_buf_release, .try_steal = zero_pipe_buf_try_steal, .get = zero_pipe_buf_get, }; static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, loff_t fpos, size_t size) { size_t offset = fpos & ~PAGE_MASK; size = min_t(size_t, size, PAGE_SIZE - offset); if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { .ops = &zero_pipe_buf_ops, .page = ZERO_PAGE(0), .offset = offset, .len = size, }; pipe->head++; } return size; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); struct address_space *mapping = inode->i_mapping; struct folio *folio = NULL; size_t total_spliced = 0, used, npages, n, part; loff_t isize; int error = 0; /* Work out how much data we can actually add into the pipe */ used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); do { bool fallback_page_splice = false; struct page *page = NULL; pgoff_t index; size_t size; if (*ppos >= i_size_read(inode)) break; index = *ppos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } if (folio) { folio_unlock(folio); page = folio_file_page(folio, index); if (PageHWPoison(page)) { error = -EIO; break; } if (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)) fallback_page_splice = true; } /* * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; /* * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned * pages. */ size = len; if (unlikely(fallback_page_splice)) { size_t offset = *ppos & ~PAGE_MASK; size = umin(size, PAGE_SIZE - offset); } part = min_t(loff_t, isize - *ppos, size); if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ if (mapping_writably_mapped(mapping)) { if (likely(!fallback_page_splice)) flush_dcache_folio(folio); else flush_dcache_page(page); } folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can * now splice it into the pipe. */ n = splice_folio_into_pipe(pipe, folio, *ppos, part); folio_put(folio); folio = NULL; } else { n = splice_zeropage_into_pipe(pipe, *ppos, part); } if (!n) break; len -= n; total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; if (pipe_is_full(pipe)) break; cond_resched(); } while (len); if (folio) folio_put(folio); file_accessed(in); return total_spliced ? total_spliced : error; } static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); if (offset < 0) return -ENXIO; inode_lock(inode); /* We're holding i_rwsem so we can access i_size directly */ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); return offset; } static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; inode_lock(inode); if (info->flags & SHMEM_F_MAPPING_FROZEN) { error = -EPERM; goto out; } if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ spin_lock(&inode->i_lock); inode->i_private = NULL; wake_up_all(&shmem_falloc_waitq); WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); spin_unlock(&inode->i_lock); error = 0; goto out; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); if (error) goto out; if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { error = -EPERM; goto out; } start = offset >> PAGE_SHIFT; end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { error = -ENOSPC; goto out; } shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; shmem_falloc.nr_unswapped = 0; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); /* * info->fallocend is only relevant when huge pages might be * involved: to prevent split_huge_page() freeing fallocated * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. */ undo_fallocend = info->fallocend; if (info->fallocend < end) info->fallocend = end; for (index = start; index < end; ) { struct folio *folio; /* * Check for fatal signal so that we abort early in OOM * situations. We don't want to abort in case of non-fatal * signals as large fallocate can take noticeable time and * e.g. periodic timers may result in fallocate constantly * restarting. */ if (fatal_signal_pending(current)) error = -EINTR; else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else error = shmem_get_folio(inode, index, offset + len, &folio, SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, ((loff_t)index << PAGE_SHIFT) - 1, true); } goto undone; } /* * Here is a more important optimization than it appears: * a second SGP_FALLOC on the same large folio will clear it, * making it uptodate and un-undoable if we fail later. */ index = folio_next_index(folio); /* Beware 32-bit wraparound */ if (!index) index--; /* * Inform shmem_writeout() how far we have reached. * No need for lock or barrier: we have the page lock. */ if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. * But mark it dirty so that memory pressure will swap rather * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); cond_resched(); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: if (!error) file_modified(file); inode_unlock(inode); return error; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; buf->f_bavail = buf->f_bfree = sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); return 0; } /* * File creation. Allocate an inode, and we're done.. */ static int shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error; if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) return -EINVAL; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); error = simple_acl_create(dir, inode); if (error) goto out_iput; error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); d_make_persistent(dentry, inode); return error; out_iput: iput(inode); return error; } static int shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_acl_create(dir, inode); if (error) goto out_iput; d_tmpfile(file, inode); err_out: return finish_open_simple(file, error); out_iput: iput(inode); return error; } static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (error) return ERR_PTR(error); inc_nlink(dir); return NULL; } static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } /* * Link a file.. */ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. * But if an O_TMPFILE file is linked into the tmpfs, the * first link must skip that, to get the accounting right. */ if (inode->i_nlink) { ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) return ret; } ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (ret) { if (inode->i_nlink) shmem_free_inode(inode->i_sb, 0); return ret; } dir->i_size += BOGO_DIRENT_SIZE; inode_inc_iversion(dir); return simple_link(old_dentry, dir, dentry); } static int shmem_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb, 0); simple_offset_remove(shmem_get_offset_ctx(dir), dentry); dir->i_size -= BOGO_DIRENT_SIZE; inode_inc_iversion(dir); simple_unlink(dir, dentry); /* * For now, VFS can't deal with case-insensitive negative dentries, so * we invalidate them */ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) d_invalidate(dentry); return 0; } static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); drop_nlink(dir); return shmem_unlink(dir, dentry); } static int shmem_whiteout(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; int error; whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); if (!whiteout) return -ENOMEM; error = shmem_mknod(idmap, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); return error; } /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ static int shmem_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); bool had_offset = false; int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; error = simple_offset_add(shmem_get_offset_ctx(new_dir), new_dentry); if (error == -EBUSY) had_offset = true; else if (unlikely(error)) return error; if (flags & RENAME_WHITEOUT) { error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) { if (!had_offset) simple_offset_remove(shmem_get_offset_ctx(new_dir), new_dentry); return error; } } simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry); if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; new_dir->i_size += BOGO_DIRENT_SIZE; simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); return 0; } static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; struct folio *folio; char *link; len = strlen(symname) + 1; if (len > PAGE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (error) goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { link = kmemdup(symname, len, GFP_KERNEL); if (!link) { error = -ENOMEM; goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; inode_set_cached_link(inode, link, len - 1); } else { inode_nohighmem(inode); inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); d_make_persistent(dentry, inode); return 0; out_remove_offset: simple_offset_remove(shmem_get_offset_ctx(dir), dentry); out_iput: iput(inode); return error; } static void shmem_put_link(void *arg) { folio_mark_accessed(arg); folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; int error; if (!dentry) { folio = filemap_get_folio(inode->i_mapping, 0); if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); } } else { error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); } folio_unlock(folio); } set_delayed_call(done, shmem_put_link, folio); return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR static int shmem_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); return 0; } static int shmem_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int ret, flags; if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) return -EOPNOTSUPP; flags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); ret = shmem_set_inode_flags(inode, flags, dentry); if (ret) return ret; info->fsflags = flags; inode_set_ctime_current(inode); inode_inc_iversion(inode); return 0; } /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs * like ACLs, we also need to implement the security.* handlers at * filesystem level, though. */ /* * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; size_t ispace = 0; size_t len; CLASS(simple_xattrs, xattrs)(); if (IS_ERR(xattrs)) return PTR_ERR(xattrs); if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, xattr->value_len + XATTR_SECURITY_PREFIX_LEN); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } } for (xattr = xattr_array; xattr->name != NULL; xattr++) { CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); if (IS_ERR(new_xattr)) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) break; memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); if (simple_xattr_add(xattrs, new_xattr)) break; retain_and_null_ptr(new_xattr); } if (xattr->name != NULL) { if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } return -ENOMEM; } smp_store_release(&info->xattrs, no_free_ptr(xattrs)); return 0; } static int shmem_xattr_handler_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); struct simple_xattrs *xattrs; xattrs = READ_ONCE(info->xattrs); if (!xattrs) return -ENODATA; name = xattr_full_name(handler, name); return simple_xattr_get(xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct simple_xattrs *xattrs; struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); xattrs = simple_xattrs_lazy_alloc(&info->xattrs, value, flags); if (IS_ERR_OR_NULL(xattrs)) return PTR_ERR(xattrs); if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->free_ispace < ispace) ispace = 0; else sbinfo->free_ispace -= ispace; raw_spin_unlock(&sbinfo->stat_lock); if (!ispace) return -ENOSPC; } old_xattr = simple_xattr_set(xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); simple_xattr_free_rcu(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); } if (ispace) { raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler shmem_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = shmem_xattr_handler_get, .set = shmem_xattr_handler_set, }; static const struct xattr_handler * const shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, &shmem_user_xattr_handler, NULL }; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); return simple_xattr_list(d_inode(dentry), READ_ONCE(info->xattrs), buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif }; static struct dentry *shmem_get_parent(struct dentry *child) { return ERR_PTR(-ESTALE); } static int shmem_match(struct inode *ino, void *vfh) { __u32 *fh = vfh; __u64 inum = fh[2]; inum = (inum << 32) | fh[1]; return ino->i_ino == inum && fh[0] == ino->i_generation; } /* Find any alias of inode, but prefer a hashed alias */ static struct dentry *shmem_find_alias(struct inode *inode) { struct dentry *alias = d_find_alias(inode); return alias ?: d_find_any_alias(inode); } static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct inode *inode; struct dentry *dentry = NULL; u64 inum; if (fh_len < 3) return NULL; inum = fid->raw[2]; inum = (inum << 32) | fid->raw[1]; inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { dentry = shmem_find_alias(inode); iput(inode); } return dentry; } static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, struct inode *parent) { if (*len < 3) { *len = 3; return FILEID_INVALID; } if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try * to do it once */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); } fh[0] = inode->i_generation; fh[1] = inode->i_ino; fh[2] = ((__u64)inode->i_ino) >> 32; *len = 3; return 1; } static const struct export_operations shmem_export_ops = { .get_parent = shmem_get_parent, .encode_fh = shmem_encode_fh, .fh_to_dentry = shmem_fh_to_dentry, }; enum shmem_param { Opt_gid, Opt_huge, Opt_mode, Opt_mpol, Opt_nr_blocks, Opt_nr_inodes, Opt_size, Opt_uid, Opt_inode32, Opt_inode64, Opt_noswap, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_usrquota_block_hardlimit, Opt_usrquota_inode_hardlimit, Opt_grpquota_block_hardlimit, Opt_grpquota_inode_hardlimit, Opt_casefold_version, Opt_casefold, Opt_strict_encoding, }; static const struct constant_table shmem_param_enums_huge[] = { {"never", SHMEM_HUGE_NEVER }, {"always", SHMEM_HUGE_ALWAYS }, {"within_size", SHMEM_HUGE_WITHIN_SIZE }, {"advise", SHMEM_HUGE_ADVISE }, {} }; const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_gid ("gid", Opt_gid), fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), fsparam_u32oct("mode", Opt_mode), fsparam_string("mpol", Opt_mpol), fsparam_string("nr_blocks", Opt_nr_blocks), fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), fsparam_uid ("uid", Opt_uid), fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), #ifdef CONFIG_TMPFS_QUOTA fsparam_flag ("quota", Opt_quota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif fsparam_string("casefold", Opt_casefold_version), fsparam_flag ("casefold", Opt_casefold), fsparam_flag ("strict_encoding", Opt_strict_encoding), {} }; #if IS_ENABLED(CONFIG_UNICODE) static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { struct shmem_options *ctx = fc->fs_private; int version = UTF8_LATEST; struct unicode_map *encoding; char *version_str = param->string + 5; if (!latest_version) { if (strncmp(param->string, "utf8-", 5)) return invalfc(fc, "Only UTF-8 encodings are supported " "in the format: utf8-<version number>"); version = utf8_parse_version(version_str); if (version < 0) return invalfc(fc, "Invalid UTF-8 version: %s", version_str); } encoding = utf8_load(version); if (IS_ERR(encoding)) { return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); } pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n", unicode_major(version), unicode_minor(version), unicode_rev(version)); ctx->encoding = encoding; return 0; } #else static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param, bool latest_version) { return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); } #endif static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { struct shmem_options *ctx = fc->fs_private; struct fs_parse_result result; unsigned long long size; char *rest; int opt; kuid_t kuid; kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_size: size = memparse(param->string, &rest); if (*rest == '%') { size <<= PAGE_SHIFT; size *= totalram_pages(); do_div(size, 100); rest++; } if (*rest) goto bad_value; ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; case Opt_mode: ctx->mode = result.uint_32 & 07777; break; case Opt_uid: kuid = result.uid; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fc->user_ns, kuid)) goto bad_value; ctx->uid = kuid; break; case Opt_gid: kgid = result.gid; /* * The requested gid must be representable in the * filesystem's idmapping. */ if (!kgid_has_mapping(fc->user_ns, kgid)) goto bad_value; ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; break; case Opt_mpol: if (IS_ENABLED(CONFIG_NUMA)) { mpol_put(ctx->mpol); ctx->mpol = NULL; if (mpol_parse_str(param->string, &ctx->mpol)) goto bad_value; break; } goto unsupported_parameter; case Opt_inode32: ctx->full_inums = false; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_inode64: if (sizeof(ino_t) < 8) { return invalfc(fc, "Cannot use inode64 with <64bit inums in kernel\n"); } ctx->full_inums = true; ctx->seen |= SHMEM_SEEN_INUMS; break; case Opt_noswap: if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) { return invalfc(fc, "Turning off swap in unprivileged tmpfs mounts unsupported"); } ctx->noswap = true; break; case Opt_quota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); break; case Opt_usrquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_USR; break; case Opt_grpquota: if (fc->user_ns != &init_user_ns) return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_GRP; break; case Opt_usrquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "User quota block hardlimit too large."); ctx->qlimits.usrquota_bhardlimit = size; break; case Opt_grpquota_block_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) return invalfc(fc, "Group quota block hardlimit too large."); ctx->qlimits.grpquota_bhardlimit = size; break; case Opt_usrquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "User quota inode hardlimit too large."); ctx->qlimits.usrquota_ihardlimit = size; break; case Opt_grpquota_inode_hardlimit: size = memparse(param->string, &rest); if (*rest || !size) goto bad_value; if (size > SHMEM_QUOTA_MAX_INO_LIMIT) return invalfc(fc, "Group quota inode hardlimit too large."); ctx->qlimits.grpquota_ihardlimit = size; break; case Opt_casefold_version: return shmem_parse_opt_casefold(fc, param, false); case Opt_casefold: return shmem_parse_opt_casefold(fc, param, true); case Opt_strict_encoding: #if IS_ENABLED(CONFIG_UNICODE) ctx->strict_encoding = true; break; #else return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n"); #endif } return 0; unsupported_parameter: return invalfc(fc, "Unsupported parameter '%s'", param->key); bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } static char *shmem_next_opt(char **s) { char *sbegin = *s; char *p; if (sbegin == NULL) return NULL; /* * NUL-terminate this option: unfortunately, * mount options form a comma-separated list, * but mpol's nodelist may also contain commas. */ for (;;) { p = strchr(*s, ','); if (p == NULL) break; *s = p + 1; if (!isdigit(*(p+1))) { *p = '\0'; return sbegin; } } *s = NULL; return sbegin; } static int shmem_parse_monolithic(struct fs_context *fc, void *data) { return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* * Reconfigure a shmem filesystem. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; goto out; } if (percpu_counter_compare(&sbinfo->used_blocks, ctx->blocks) > 0) { err = "Too small a size for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { if (!sbinfo->max_inodes) { err = "Cannot retroactively limit inodes"; goto out; } if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } } if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && sbinfo->next_ino > UINT_MAX) { err = "Current inum too high to switch to 32-bit inums"; goto out; } /* * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap" * counterpart for (re-)enabling swap. */ if (ctx->noswap && !sbinfo->noswap) { err = "Cannot disable swap on remount"; goto out; } if (ctx->seen & SHMEM_SEEN_QUOTA && !sb_any_quota_loaded(fc->root->d_sb)) { err = "Cannot enable quota on remount"; goto out; } #ifdef CONFIG_TMPFS_QUOTA #define CHANGED_LIMIT(name) \ (ctx->qlimits.name## hardlimit && \ (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { err = "Cannot change global quota limit on remount"; goto out; } #endif /* CONFIG_TMPFS_QUOTA */ if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) sbinfo->full_inums = ctx->full_inums; if (ctx->seen & SHMEM_SEEN_BLOCKS) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } if (ctx->noswap) sbinfo->noswap = true; raw_spin_unlock(&sbinfo->stat_lock); mpol_put(mpol); return 0; out: raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } static int shmem_show_options(struct seq_file *seq, struct dentry *root) { struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) seq_printf(seq, ",mode=%03ho", sbinfo->mode); if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbinfo->uid)); if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); /* * Showing inode{64,32} might be useful even if it's the system default, * since then people don't have to resort to checking both here and * /proc/config.gz to confirm 64-bit inums were successfully applied * (which may not even exist if IKCONFIG_PROC isn't enabled). * * We hide it when inode64 isn't the default and we are using 32-bit * inodes, since that probably just means the feature isn't even under * consideration. * * As such: * * +-----------------+-----------------+ * | TMPFS_INODE64=y | TMPFS_INODE64=n | * +------------------+-----------------+-----------------+ * | full_inums=true | show | show | * | full_inums=false | show | hide | * +------------------+-----------------+-----------------+ * */ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); #endif mpol = shmem_get_sbmpol(sbinfo); shmem_show_mpol(seq, mpol); mpol_put(mpol); if (sbinfo->noswap) seq_printf(seq, ",noswap"); #ifdef CONFIG_TMPFS_QUOTA if (sb_has_quota_active(root->d_sb, USRQUOTA)) seq_printf(seq, ",usrquota"); if (sb_has_quota_active(root->d_sb, GRPQUOTA)) seq_printf(seq, ",grpquota"); if (sbinfo->qlimits.usrquota_bhardlimit) seq_printf(seq, ",usrquota_block_hardlimit=%lld", sbinfo->qlimits.usrquota_bhardlimit); if (sbinfo->qlimits.grpquota_bhardlimit) seq_printf(seq, ",grpquota_block_hardlimit=%lld", sbinfo->qlimits.grpquota_bhardlimit); if (sbinfo->qlimits.usrquota_ihardlimit) seq_printf(seq, ",usrquota_inode_hardlimit=%lld", sbinfo->qlimits.usrquota_ihardlimit); if (sbinfo->qlimits.grpquota_ihardlimit) seq_printf(seq, ",grpquota_inode_hardlimit=%lld", sbinfo->qlimits.grpquota_ihardlimit); #endif return 0; } #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_TMPFS_QUOTA shmem_disable_quotas(sb); #endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); sb->s_fs_info = NULL; } #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS) static const struct dentry_operations shmem_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, }; #endif static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return error; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS /* * Per default we only allow half of the physical ram per * tmpfs instance, limiting inodes to one per page of lowmem; * but the internal instance is left unlimited. */ if (!(sb->s_flags & SB_KERNMOUNT)) { if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) ctx->blocks = shmem_default_max_blocks(); if (!(ctx->seen & SHMEM_SEEN_INODES)) ctx->inodes = shmem_default_max_inodes(); if (!(ctx->seen & SHMEM_SEEN_INUMS)) ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); sbinfo->noswap = ctx->noswap; } else { sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; sb->s_flags |= SB_NOSEC; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { pr_err("tmpfs: strict_encoding option without encoding is forbidden\n"); error = -EINVAL; goto failed; } if (ctx->encoding) { sb->s_encoding = ctx->encoding; set_default_d_op(sb, &shmem_ci_dentry_ops); if (ctx->strict_encoding) sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL; } #endif #else sb->s_flags |= SB_NOUSER; #endif /* CONFIG_TMPFS */ sb->s_d_flags |= DCACHE_DONTCACHE; sbinfo->max_blocks = ctx->blocks; sbinfo->max_inodes = ctx->inodes; sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) goto failed; } sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; else sbinfo->huge = tmpfs_huge; #endif sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; #ifdef CONFIG_TMPFS_XATTR sb->s_xattr = shmem_xattr_handlers; #endif #ifdef CONFIG_TMPFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif uuid_t uuid; uuid_gen(&uuid); super_set_uuid(sb, uuid.b, sizeof(uuid)); #ifdef CONFIG_TMPFS_QUOTA if (ctx->seen & SHMEM_SEEN_QUOTA) { sb->dq_op = &shmem_quota_operations; sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; /* Copy the default limits from ctx into sbinfo */ memcpy(&sbinfo->qlimits, &ctx->qlimits, sizeof(struct shmem_quota_limits)); if (shmem_enable_quotas(sb, ctx->quota_types)) goto failed; } #endif /* CONFIG_TMPFS_QUOTA */ inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, mk_vma_flags(VMA_NORESERVE_BIT)); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); if (!sb->s_root) goto failed; return 0; failed: shmem_put_super(sb); return error; } static int shmem_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, shmem_fill_super); } static void shmem_free_fc(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; if (ctx) { mpol_put(ctx->mpol); kfree(ctx); } } static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif }; static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; } static void shmem_free_in_core_inode(struct inode *inode) { if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); if (S_ISDIR(inode->i_mode)) simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) { struct shmem_inode_info *info = foo; inode_init_once(&info->vfs_inode); } static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } /* Keep the page in page cache instead of truncating it */ static int shmem_error_remove_folio(struct address_space *mapping, struct folio *folio) { return 0; } static const struct address_space_operations shmem_aops = { .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION .migrate_folio = migrate_folio, #endif .error_remove_folio = shmem_error_remove_folio, }; static const struct file_operations shmem_file_operations = { .mmap_prepare = shmem_mmap_prepare, .open = shmem_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, .read_iter = shmem_file_read_iter, .write_iter = shmem_file_write_iter, .fsync = noop_fsync, .splice_read = shmem_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = shmem_fallocate, .setlease = generic_setlease, #endif }; static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, .unlink = shmem_unlink, .symlink = shmem_symlink, .mkdir = shmem_mkdir, .rmdir = shmem_rmdir, .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .fileattr_get = shmem_fileattr_get, .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct inode_operations shmem_special_inode_operations = { .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, .set_acl = simple_set_acl, #endif }; static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, .show_options = shmem_show_options, #endif #ifdef CONFIG_TMPFS_QUOTA .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif }; static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif #ifdef CONFIG_USERFAULTFD .uffd_ops = &shmem_uffd_ops, #endif }; static const struct vm_operations_struct shmem_anon_vm_ops = { .fault = shmem_fault, .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif #ifdef CONFIG_USERFAULTFD .uffd_ops = &shmem_uffd_ops, #endif }; int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; ctx = kzalloc_obj(struct shmem_options); if (!ctx) return -ENOMEM; ctx->mode = 0777 | S_ISVTX; ctx->uid = current_fsuid(); ctx->gid = current_fsgid(); #if IS_ENABLED(CONFIG_UNICODE) ctx->encoding = NULL; #endif fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; #ifdef CONFIG_TMPFS fc->sb_flags |= SB_I_VERSION; #endif return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .init_fs_context = shmem_init_fs_context, #ifdef CONFIG_TMPFS .parameters = shmem_fs_parameters, #endif .kill_sb = kill_anon_super, .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, }; #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) #define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } #define TMPFS_ATTR_W(_name, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) #define TMPFS_ATTR_RW(_name, _show, _store) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) #define TMPFS_ATTR_RO(_name, _show) \ static struct kobj_attribute tmpfs_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) #if IS_ENABLED(CONFIG_UNICODE) static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { return sysfs_emit(buf, "supported\n"); } TMPFS_ATTR_RO(casefold, casefold_show); #endif static struct attribute *tmpfs_attributes[] = { #if IS_ENABLED(CONFIG_UNICODE) &tmpfs_attr_casefold.attr, #endif NULL }; static const struct attribute_group tmpfs_attribute_group = { .attrs = tmpfs_attributes, .name = "features" }; static struct kobject *tmpfs_kobj; static int __init tmpfs_sysfs_init(void) { int ret; tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj); if (!tmpfs_kobj) return -ENOMEM; ret = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group); if (ret) kobject_put(tmpfs_kobj); return ret; } #endif /* CONFIG_SYSFS && CONFIG_TMPFS */ void __init shmem_init(void) { int error; shmem_init_inodecache(); #ifdef CONFIG_TMPFS_QUOTA register_quota_format(&shmem_quota_format); #endif error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); pr_err("Could not kern_mount tmpfs\n"); goto out1; } #if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS) error = tmpfs_sysfs_init(); if (error) { pr_err("Could not init tmpfs sysfs\n"); goto out1; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ /* * Default to setting PMD-sized THP to inherit the global setting and * disable all other multi-size THPs. */ if (!shmem_orders_configured) huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; out1: unregister_filesystem(&shmem_fs_type); out2: #ifdef CONFIG_TMPFS_QUOTA unregister_quota_format(&shmem_quota_format); #endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, SHMEM_HUGE_NEVER, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; int len = 0; int i; for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, shmem_huge == values[i] ? "%s[%s]" : "%s%s", i ? " " : "", shmem_format_huge(values[i])); } len += sysfs_emit_at(buf, len, "\n"); return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; memcpy(tmp, buf, count); tmp[count] = '\0'; if (count && tmp[count - 1] == '\n') tmp[count - 1] = '\0'; huge = shmem_parse_huge(tmp); if (huge == -EINVAL) return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; err = start_stop_khugepaged(); return err ? err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); static DEFINE_SPINLOCK(huge_shmem_orders_lock); static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int order = to_thpsize(kobj)->order; const char *output; if (test_bit(order, &huge_shmem_orders_always)) output = "[always] inherit within_size advise never"; else if (test_bit(order, &huge_shmem_orders_inherit)) output = "always [inherit] within_size advise never"; else if (test_bit(order, &huge_shmem_orders_within_size)) output = "always inherit [within_size] advise never"; else if (test_bit(order, &huge_shmem_orders_madvise)) output = "always inherit within_size [advise] never"; else output = "always inherit within_size advise [never]"; return sysfs_emit(buf, "%s\n", output); } static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int order = to_thpsize(kobj)->order; ssize_t ret = count; if (sysfs_streq(buf, "always")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_always); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "inherit")) { /* Do not override huge allocation policy with non-PMD sized mTHP */ if (shmem_huge == SHMEM_HUGE_FORCE && order != HPAGE_PMD_ORDER) return -EINVAL; spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_madvise); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_inherit); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "within_size")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_madvise); set_bit(order, &huge_shmem_orders_within_size); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "advise")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); set_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else if (sysfs_streq(buf, "never")) { spin_lock(&huge_shmem_orders_lock); clear_bit(order, &huge_shmem_orders_always); clear_bit(order, &huge_shmem_orders_inherit); clear_bit(order, &huge_shmem_orders_within_size); clear_bit(order, &huge_shmem_orders_madvise); spin_unlock(&huge_shmem_orders_lock); } else { ret = -EINVAL; } if (ret > 0) { int err = start_stop_khugepaged(); if (err) ret = err; } return ret; } struct kobj_attribute thpsize_shmem_enabled_attr = __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #if defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __init setup_transparent_hugepage_shmem(char *str) { int huge; huge = shmem_parse_huge(str); if (huge == -EINVAL) { pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); return huge; } shmem_huge = huge; return 1; } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); static int __init setup_transparent_hugepage_tmpfs(char *str) { int huge; huge = shmem_parse_huge(str); if (huge < 0) { pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n"); return huge; } tmpfs_huge = huge; return 1; } __setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs); static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_shmem(char *str) { char *token, *range, *policy, *subtoken; unsigned long always, inherit, madvise, within_size; char *start_size, *end_size; int start, end, nr; char *p; if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; strscpy(str_dup, str); always = huge_shmem_orders_always; inherit = huge_shmem_orders_inherit; madvise = huge_shmem_orders_madvise; within_size = huge_shmem_orders_within_size; p = str_dup; while ((token = strsep(&p, ";")) != NULL) { range = strsep(&token, ":"); policy = token; if (!policy) goto err; while ((subtoken = strsep(&range, ",")) != NULL) { if (strchr(subtoken, '-')) { start_size = strsep(&subtoken, "-"); end_size = subtoken; start = get_order_from_str(start_size, THP_ORDERS_ALL_FILE_DEFAULT); end = get_order_from_str(end_size, THP_ORDERS_ALL_FILE_DEFAULT); } else { start_size = end_size = subtoken; start = end = get_order_from_str(subtoken, THP_ORDERS_ALL_FILE_DEFAULT); } if (start < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", start_size); goto err; } if (end < 0) { pr_err("invalid size %s in thp_shmem boot parameter\n", end_size); goto err; } if (start > end) goto err; nr = end - start + 1; if (!strcmp(policy, "always")) { bitmap_set(&always, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "advise")) { bitmap_set(&madvise, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "inherit")) { bitmap_set(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else if (!strcmp(policy, "within_size")) { bitmap_set(&within_size, start, nr); bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); } else if (!strcmp(policy, "never")) { bitmap_clear(&inherit, start, nr); bitmap_clear(&madvise, start, nr); bitmap_clear(&always, start, nr); bitmap_clear(&within_size, start, nr); } else { pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); goto err; } } } huge_shmem_orders_always = always; huge_shmem_orders_madvise = madvise; huge_shmem_orders_inherit = inherit; huge_shmem_orders_within_size = within_size; shmem_orders_configured = true; return 1; err: pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); return 0; } __setup("thp_shmem=", setup_thp_shmem); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ /* * tiny-shmem: simple shmemfs and tmpfs using ramfs code * * This is intended for small system where the benefits of the full * shmem code (swap-backed and resource-limited) are outweighed by * their complexity. On systems without swap this code should be * effectively equivalent, but much lighter weight. */ static struct file_system_type shmem_fs_type = { .name = "tmpfs", .init_fs_context = ramfs_init_fs_context, .parameters = ramfs_fs_parameters, .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; void __init shmem_init(void) { BUG_ON(register_filesystem(&shmem_fs_type) != 0); shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); } int shmem_unuse(unsigned int type) { return 0; } int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } void shmem_unlock_mapping(struct address_space *mapping) { } #ifdef CONFIG_MMU unsigned long shmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return mm_get_unmapped_area(file, addr, len, pgoff, flags); } #endif void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend) { truncate_inode_pages_range(inode->i_mapping, lstart, lend); } EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations static inline int shmem_acct_size(unsigned long flags, loff_t size) { return 0; } static inline void shmem_unacct_size(unsigned long flags, loff_t size) { } static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, vma_flags_t flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); } #endif /* CONFIG_SHMEM */ /* common code */ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, vma_flags_t flags, unsigned int i_flags) { const unsigned long shmem_flags = vma_flags_test(&flags, VMA_NORESERVE_BIT) ? SHMEM_F_NORESERVE : 0; struct inode *inode; struct file *res; if (IS_ERR(mnt)) return ERR_CAST(mnt); if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); if (is_idmapped_mnt(mnt)) return ERR_PTR(-EINVAL); if (shmem_acct_size(shmem_flags, size)) return ERR_PTR(-ENOMEM); inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (IS_ERR(inode)) { shmem_unacct_size(shmem_flags, size); return ERR_CAST(inode); } inode->i_flags |= i_flags; inode->i_size = size; clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (!IS_ERR(res)) res = alloc_file_pseudo(inode, mnt, name, O_RDWR, &shmem_file_operations); if (IS_ERR(res)) iput(inode); return res; } /** * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a * higher layer. The users are the big_key and shm implementations. LSM * checks are provided at the key or shm level rather than the inode. * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_kernel_file_setup(const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup(const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(shm_mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs * @mnt: the tmpfs mount where the file will be created * @name: name for dentry (to be seen in /proc/<pid>/maps) * @size: size to be set for the file * @flags: VMA_NORESERVE_BIT suppresses pre-accounting of the entire object size */ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, vma_flags_t flags) { return __shmem_file_setup(mnt, name, size, flags, 0); } EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vma_flags_t flags) { loff_t size = end - start; /* * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). */ return shmem_kernel_file_setup("dev/zero", size, flags); } /** * shmem_zero_setup - setup a shared anonymous mapping * @vma: the vma to be mmapped is prepared by do_mmap * Returns: 0 on success, or error */ int shmem_zero_setup(struct vm_area_struct *vma) { struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->flags); if (IS_ERR(file)) return PTR_ERR(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA * descriptor for convenience. * @desc: Describes VMA * Returns: 0 on success, or error */ int shmem_zero_setup_desc(struct vm_area_desc *desc) { struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vma_flags); if (IS_ERR(file)) return PTR_ERR(file); desc->vm_file = file; desc->vm_ops = &shmem_anon_vm_ops; return 0; } /** * shmem_read_folio_gfp - read into page cache, using specified page allocation flags. * @mapping: the folio's address_space * @index: the folio index * @gfp: the page allocator flags to use if allocating * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. */ struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; struct folio *folio; int error; error = shmem_get_folio_gfp(inode, index, i_size_read(inode), &folio, SGP_CACHE, gfp, NULL, NULL); if (error) return ERR_PTR(error); folio_unlock(folio); return folio; #else /* * The tiny !SHMEM case uses ramfs without swap */ return mapping_read_folio_gfp(mapping, index, gfp); #endif } EXPORT_SYMBOL_GPL(shmem_read_folio_gfp); struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp); struct page *page; if (IS_ERR(folio)) return &folio->page; page = folio_file_page(folio, index); if (PageHWPoison(page)) { folio_put(folio); return ERR_PTR(-EIO); } return page; } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
12 12 12 1 1 1 1 1 12 1 19 19 18 1 19 19 19 12 12 1 11 1 1 1 1 1 4 12 1 11 10 10 1 9 4 5 1 4 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2017 Red Hat, Inc. */ #include <linux/cred.h> #include <linux/file.h> #include <linux/filelock.h> #include <linux/mount.h> #include <linux/xattr.h> #include <linux/uio.h> #include <linux/uaccess.h> #include <linux/security.h> #include <linux/fs.h> #include <linux/backing-file.h> #include "overlayfs.h" static char ovl_whatisit(struct inode *inode, struct inode *realinode) { if (realinode != ovl_inode_upper(inode)) return 'l'; if (ovl_has_upperdata(inode)) return 'u'; else return 'm'; } static struct file *ovl_open_realfile(const struct file *file, const struct path *realpath) { struct inode *realinode = d_inode(realpath->dentry); struct inode *inode = file_inode(file); struct mnt_idmap *real_idmap; struct file *realfile; int flags = file->f_flags | OVL_OPEN_FLAGS; int acc_mode = ACC_MODE(flags); int err; if (flags & O_APPEND) acc_mode |= MAY_APPEND; with_ovl_creds(inode->i_sb) { real_idmap = mnt_idmap(realpath->mnt); err = inode_permission(real_idmap, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; realfile = backing_file_open(file_user_path(file), flags, realpath, current_cred()); } } pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", file, file, ovl_whatisit(inode, realinode), file->f_flags, realfile, IS_ERR(realfile) ? 0 : realfile->f_flags); return realfile; } #define OVL_SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT) static int ovl_change_flags(struct file *file, unsigned int flags) { struct inode *inode = file_inode(file); int err; flags &= OVL_SETFL_MASK; if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; if (file->f_op->check_flags) { err = file->f_op->check_flags(flags); if (err) return err; } spin_lock(&file->f_lock); file->f_flags = (file->f_flags & ~OVL_SETFL_MASK) | flags; file->f_iocb_flags = iocb_flags(file); spin_unlock(&file->f_lock); return 0; } struct ovl_file { struct file *realfile; struct file *upperfile; }; struct ovl_file *ovl_file_alloc(struct file *realfile) { struct ovl_file *of = kzalloc_obj(struct ovl_file); if (unlikely(!of)) return NULL; of->realfile = realfile; return of; } void ovl_file_free(struct ovl_file *of) { fput(of->realfile); if (of->upperfile) fput(of->upperfile); kfree(of); } static bool ovl_is_real_file(const struct file *realfile, const struct path *realpath) { return file_inode(realfile) == d_inode(realpath->dentry); } static struct file *ovl_real_file_path(const struct file *file, const struct path *realpath) { struct ovl_file *of = file->private_data; struct file *realfile = of->realfile; if (WARN_ON_ONCE(!realpath->dentry)) return ERR_PTR(-EIO); /* * If the realfile that we want is not where the data used to be at * open time, either we'd been copied up, or it's an fsync of a * metacopied file. We need the upperfile either way, so see if it * is already opened and if it is not then open and store it. */ if (unlikely(!ovl_is_real_file(realfile, realpath))) { struct file *upperfile = READ_ONCE(of->upperfile); struct file *old; if (!upperfile) { /* Nobody opened upperfile yet */ upperfile = ovl_open_realfile(file, realpath); if (IS_ERR(upperfile)) return upperfile; /* Store the upperfile for later */ old = cmpxchg_release(&of->upperfile, NULL, upperfile); if (old) { /* Someone opened upperfile before us */ fput(upperfile); upperfile = old; } } /* * Stored file must be from the right inode, unless someone's * been corrupting the upper layer. */ if (WARN_ON_ONCE(!ovl_is_real_file(upperfile, realpath))) return ERR_PTR(-EIO); realfile = upperfile; } /* Did the flags change since open? */ if (unlikely((file->f_flags ^ realfile->f_flags) & ~OVL_OPEN_FLAGS)) { int err = ovl_change_flags(realfile, file->f_flags); if (err) return ERR_PTR(err); } return realfile; } static struct file *ovl_real_file(const struct file *file) { struct dentry *dentry = file_dentry(file); struct path realpath; int err; if (d_is_dir(dentry)) { struct file *f = ovl_dir_real_file(file, false); if (WARN_ON_ONCE(!f)) return ERR_PTR(-EIO); return f; } /* lazy lookup and verify of lowerdata */ err = ovl_verify_lowerdata(dentry); if (err) return ERR_PTR(err); ovl_path_realdata(dentry, &realpath); return ovl_real_file_path(file, &realpath); } static int ovl_open(struct inode *inode, struct file *file) { struct dentry *dentry = file_dentry(file); struct file *realfile; struct path realpath; struct ovl_file *of; int err; /* lazy lookup and verify lowerdata */ err = ovl_verify_lowerdata(dentry); if (err) return err; err = ovl_maybe_copy_up(dentry, file->f_flags); if (err) return err; /* No longer need these flags, so don't pass them on to underlying fs */ file->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); ovl_path_realdata(dentry, &realpath); if (!realpath.dentry) return -EIO; realfile = ovl_open_realfile(file, &realpath); if (IS_ERR(realfile)) return PTR_ERR(realfile); of = ovl_file_alloc(realfile); if (!of) { fput(realfile); return -ENOMEM; } file->private_data = of; return 0; } static int ovl_release(struct inode *inode, struct file *file) { ovl_file_free(file->private_data); return 0; } static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file_inode(file); struct file *realfile; loff_t ret; /* * The two special cases below do not need to involve real fs, * so we can optimizing concurrent callers. */ if (offset == 0) { if (whence == SEEK_CUR) return file->f_pos; if (whence == SEEK_SET) return vfs_setpos(file, 0, 0); } realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); /* * Overlay file f_pos is the master copy that is preserved * through copy up and modified on read/write, but only real * fs knows how to SEEK_HOLE/SEEK_DATA and real fs may impose * limitations that are more strict than ->s_maxbytes for specific * files, so we use the real file to perform seeks. */ ovl_inode_lock(inode); realfile->f_pos = file->f_pos; with_ovl_creds(inode->i_sb) ret = vfs_llseek(realfile, offset, whence); file->f_pos = realfile->f_pos; ovl_inode_unlock(inode); return ret; } static void ovl_file_modified(struct file *file) { /* Update size/mtime */ ovl_copyattr(file_inode(file)); } static void ovl_file_end_write(struct kiocb *iocb, ssize_t ret) { ovl_file_modified(iocb->ki_filp); } static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; struct timespec64 ctime, uctime; struct timespec64 mtime, umtime; if (file->f_flags & O_NOATIME) return; inode = file_inode(file); upperinode = ovl_inode_upper(inode); if (!upperinode) return; ctime = inode_get_ctime(inode); uctime = inode_get_ctime(upperinode); mtime = inode_get_mtime(inode); umtime = inode_get_mtime(upperinode); if ((!timespec64_equal(&mtime, &umtime)) || !timespec64_equal(&ctime, &uctime)) { inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode)); inode_set_ctime_to_ts(inode, uctime); } touch_atime(&file->f_path); } static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct file *realfile; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(file)->i_sb), .accessed = ovl_file_accessed, }; if (!iov_iter_count(iter)) return 0; realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); return backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags, &ctx); } static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct file *realfile; ssize_t ret; int ifl = iocb->ki_flags; struct backing_file_ctx ctx = { .cred = ovl_creds(inode->i_sb), .end_write = ovl_file_end_write, }; if (!iov_iter_count(iter)) return 0; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); realfile = ovl_real_file(file); ret = PTR_ERR(realfile); if (IS_ERR(realfile)) goto out_unlock; if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); ret = backing_file_write_iter(realfile, iter, iocb, ifl, &ctx); out_unlock: inode_unlock(inode); return ret; } static ssize_t ovl_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct file *realfile; ssize_t ret; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(in)->i_sb), .accessed = ovl_file_accessed, }; struct kiocb iocb; realfile = ovl_real_file(in); if (IS_ERR(realfile)) return PTR_ERR(realfile); init_sync_kiocb(&iocb, in); iocb.ki_pos = *ppos; ret = backing_file_splice_read(realfile, &iocb, pipe, len, flags, &ctx); *ppos = iocb.ki_pos; return ret; } /* * Calling iter_file_splice_write() directly from overlay's f_op may deadlock * due to lock order inversion between pipe->mutex in iter_file_splice_write() * and file_start_write(realfile) in ovl_write_iter(). * * So do everything ovl_write_iter() does and call iter_file_splice_write() on * the real file. */ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct file *realfile; struct inode *inode = file_inode(out); ssize_t ret; struct backing_file_ctx ctx = { .cred = ovl_creds(inode->i_sb), .end_write = ovl_file_end_write, }; struct kiocb iocb; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); realfile = ovl_real_file(out); ret = PTR_ERR(realfile); if (IS_ERR(realfile)) goto out_unlock; init_sync_kiocb(&iocb, out); iocb.ki_pos = *ppos; ret = backing_file_splice_write(pipe, realfile, &iocb, len, flags, &ctx); *ppos = iocb.ki_pos; out_unlock: inode_unlock(inode); return ret; } static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); enum ovl_path_type type; struct path upperpath; struct file *upperfile; int ret; ret = ovl_sync_status(OVL_FS(file_inode(file)->i_sb)); if (ret <= 0) return ret; /* Don't sync lower file for fear of receiving EROFS error */ type = ovl_path_type(dentry); if (!OVL_TYPE_UPPER(type) || (datasync && OVL_TYPE_MERGE(type))) return 0; ovl_path_upper(dentry, &upperpath); upperfile = ovl_real_file_path(file, &upperpath); if (IS_ERR(upperfile)) return PTR_ERR(upperfile); with_ovl_creds(file_inode(file)->i_sb) return vfs_fsync_range(upperfile, start, end, datasync); } static int ovl_mmap(struct file *file, struct vm_area_struct *vma) { struct ovl_file *of = file->private_data; struct backing_file_ctx ctx = { .cred = ovl_creds(file_inode(file)->i_sb), .accessed = ovl_file_accessed, }; return backing_file_mmap(of->realfile, vma, &ctx); } static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct file *realfile; int ret; inode_lock(inode); /* Update mode */ ovl_copyattr(inode); ret = file_remove_privs(file); if (ret) goto out_unlock; realfile = ovl_real_file(file); ret = PTR_ERR(realfile); if (IS_ERR(realfile)) goto out_unlock; with_ovl_creds(inode->i_sb) ret = vfs_fallocate(realfile, mode, offset, len); /* Update size */ ovl_file_modified(file); out_unlock: inode_unlock(inode); return ret; } static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { struct file *realfile; realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); with_ovl_creds(file_inode(file)->i_sb) return vfs_fadvise(realfile, offset, len, advice); } enum ovl_copyop { OVL_COPY, OVL_CLONE, OVL_DEDUPE, }; static loff_t ovl_copyfile(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int flags, enum ovl_copyop op) { struct inode *inode_out = file_inode(file_out); struct file *realfile_in, *realfile_out; loff_t ret; inode_lock(inode_out); if (op != OVL_DEDUPE) { /* Update mode */ ovl_copyattr(inode_out); ret = file_remove_privs(file_out); if (ret) goto out_unlock; } realfile_out = ovl_real_file(file_out); ret = PTR_ERR(realfile_out); if (IS_ERR(realfile_out)) goto out_unlock; realfile_in = ovl_real_file(file_in); ret = PTR_ERR(realfile_in); if (IS_ERR(realfile_in)) goto out_unlock; with_ovl_creds(file_inode(file_out)->i_sb) { switch (op) { case OVL_COPY: ret = vfs_copy_file_range(realfile_in, pos_in, realfile_out, pos_out, len, flags); break; case OVL_CLONE: ret = vfs_clone_file_range(realfile_in, pos_in, realfile_out, pos_out, len, flags); break; case OVL_DEDUPE: ret = vfs_dedupe_file_range_one(realfile_in, pos_in, realfile_out, pos_out, len, flags); break; } } /* Update size */ ovl_file_modified(file_out); out_unlock: inode_unlock(inode_out); return ret; } static ssize_t ovl_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags) { return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, flags, OVL_COPY); } static loff_t ovl_remap_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags) { enum ovl_copyop op; if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; if (remap_flags & REMAP_FILE_DEDUP) op = OVL_DEDUPE; else op = OVL_CLONE; /* * Don't copy up because of a dedupe request, this wouldn't make sense * most of the time (data would be duplicated instead of deduplicated). */ if (op == OVL_DEDUPE && (!ovl_inode_upper(file_inode(file_in)) || !ovl_inode_upper(file_inode(file_out)))) return -EPERM; return ovl_copyfile(file_in, pos_in, file_out, pos_out, len, remap_flags, op); } static int ovl_flush(struct file *file, fl_owner_t id) { struct file *realfile; int err = 0; realfile = ovl_real_file(file); if (IS_ERR(realfile)) return PTR_ERR(realfile); if (realfile->f_op->flush) { with_ovl_creds(file_inode(file)->i_sb) err = realfile->f_op->flush(realfile, id); } return err; } const struct file_operations ovl_file_operations = { .open = ovl_open, .release = ovl_release, .llseek = ovl_llseek, .read_iter = ovl_read_iter, .write_iter = ovl_write_iter, .fsync = ovl_fsync, .mmap = ovl_mmap, .fallocate = ovl_fallocate, .fadvise = ovl_fadvise, .flush = ovl_flush, .splice_read = ovl_splice_read, .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, .setlease = generic_setlease, };
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * configfs_internal.h - Internal stuff for configfs * * Based on sysfs: * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. */ #ifdef pr_fmt #undef pr_fmt #endif #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> struct configfs_fragment { atomic_t frag_count; struct rw_semaphore frag_sem; bool frag_dead; }; void put_fragment(struct configfs_fragment *); struct configfs_fragment *get_fragment(struct configfs_fragment *); struct configfs_dirent { atomic_t s_count; int s_dependent_count; struct list_head s_sibling; struct list_head s_children; int s_links; void * s_element; int s_type; umode_t s_mode; struct dentry * s_dentry; struct iattr * s_iattr; #ifdef CONFIG_LOCKDEP int s_depth; #endif struct configfs_fragment *s_frag; }; #define CONFIGFS_ROOT 0x0001 #define CONFIGFS_DIR 0x0002 #define CONFIGFS_ITEM_ATTR 0x0004 #define CONFIGFS_ITEM_BIN_ATTR 0x0008 #define CONFIGFS_ITEM_LINK 0x0020 #define CONFIGFS_USET_DIR 0x0040 #define CONFIGFS_USET_DEFAULT 0x0080 #define CONFIGFS_USET_DROPPING 0x0100 #define CONFIGFS_USET_IN_MKDIR 0x0200 #define CONFIGFS_USET_CREATING 0x0400 #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR) #define CONFIGFS_PINNED \ (CONFIGFS_ROOT | CONFIGFS_DIR | CONFIGFS_ITEM_LINK) extern struct mutex configfs_symlink_mutex; extern spinlock_t configfs_dirent_lock; extern struct kmem_cache *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); extern struct inode *configfs_create(struct dentry *, umode_t mode); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_create_bin_file(struct config_item *, const struct configfs_bin_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int, struct configfs_fragment *); extern int configfs_dirent_is_ready(struct configfs_dirent *); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); extern int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); extern struct dentry *configfs_pin_fs(void); extern void configfs_release_fs(void); extern const struct file_operations configfs_dir_operations; extern const struct file_operations configfs_file_operations; extern const struct file_operations configfs_bin_file_operations; extern const struct inode_operations configfs_dir_inode_operations; extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; extern const struct dentry_operations configfs_dentry_ops; extern int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); extern int configfs_unlink(struct inode *dir, struct dentry *dentry); int configfs_create_link(struct configfs_dirent *target, struct dentry *parent, struct dentry *dentry, char *body); static inline struct config_item * to_item(struct dentry * dentry) { struct configfs_dirent * sd = dentry->d_fsdata; return ((struct config_item *) sd->s_element); } static inline struct configfs_attribute * to_attr(struct dentry * dentry) { struct configfs_dirent * sd = dentry->d_fsdata; return ((struct configfs_attribute *) sd->s_element); } static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry) { struct configfs_attribute *attr = to_attr(dentry); return container_of(attr, struct configfs_bin_attribute, cb_attr); } static inline struct config_item *configfs_get_config_item(struct dentry *dentry) { struct config_item * item = NULL; spin_lock(&dentry->d_lock); if (!d_unhashed(dentry)) { struct configfs_dirent * sd = dentry->d_fsdata; item = config_item_get(sd->s_element); } spin_unlock(&dentry->d_lock); return item; } static inline void release_configfs_dirent(struct configfs_dirent * sd) { if (!(sd->s_type & CONFIGFS_ROOT)) { kfree(sd->s_iattr); put_fragment(sd->s_frag); kmem_cache_free(configfs_dir_cachep, sd); } } static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) { if (sd) { WARN_ON(!atomic_read(&sd->s_count)); atomic_inc(&sd->s_count); } return sd; } static inline void configfs_put(struct configfs_dirent * sd) { WARN_ON(!atomic_read(&sd->s_count)); if (atomic_dec_and_test(&sd->s_count)) release_configfs_dirent(sd); }
1 1 1 9 1 9 10 1 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-or-later /* * Plantronics USB HID Driver * * Copyright (c) 2014 JD Cole <jd.cole@plantronics.com> * Copyright (c) 2015-2018 Terry Junge <terry.junge@plantronics.com> */ #include "hid-ids.h" #include <linux/hid.h> #include <linux/module.h> #include <linux/jiffies.h> #define PLT_HID_1_0_PAGE 0xffa00000 #define PLT_HID_2_0_PAGE 0xffa20000 #define PLT_BASIC_TELEPHONY 0x0003 #define PLT_BASIC_EXCEPTION 0x0005 #define PLT_VOL_UP 0x00b1 #define PLT_VOL_DOWN 0x00b2 #define PLT_MIC_MUTE 0x00b5 #define PLT1_VOL_UP (PLT_HID_1_0_PAGE | PLT_VOL_UP) #define PLT1_VOL_DOWN (PLT_HID_1_0_PAGE | PLT_VOL_DOWN) #define PLT1_MIC_MUTE (PLT_HID_1_0_PAGE | PLT_MIC_MUTE) #define PLT2_VOL_UP (PLT_HID_2_0_PAGE | PLT_VOL_UP) #define PLT2_VOL_DOWN (PLT_HID_2_0_PAGE | PLT_VOL_DOWN) #define PLT2_MIC_MUTE (PLT_HID_2_0_PAGE | PLT_MIC_MUTE) #define HID_TELEPHONY_MUTE (HID_UP_TELEPHONY | 0x2f) #define HID_CONSUMER_MUTE (HID_UP_CONSUMER | 0xe2) #define PLT_DA60 0xda60 #define PLT_BT300_MIN 0x0413 #define PLT_BT300_MAX 0x0418 #define PLT_DOUBLE_KEY_TIMEOUT 5 /* ms */ struct plt_drv_data { unsigned long device_type; unsigned long last_key_ts; unsigned long double_key_to; __u16 last_key; }; static int plantronics_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { unsigned short mapped_key; struct plt_drv_data *drv_data = hid_get_drvdata(hdev); unsigned long plt_type = drv_data->device_type; int allow_mute = usage->hid == HID_TELEPHONY_MUTE; int allow_consumer = field->application == HID_CP_CONSUMERCONTROL && (usage->hid & HID_USAGE_PAGE) == HID_UP_CONSUMER && usage->hid != HID_CONSUMER_MUTE; /* special case for PTT products */ if (field->application == HID_GD_JOYSTICK) goto defaulted; /* non-standard types or multi-HID interfaces - plt_type is PID */ if (!(plt_type & HID_USAGE_PAGE)) { switch (plt_type) { case PLT_DA60: if (allow_consumer) goto defaulted; if (usage->hid == HID_CONSUMER_MUTE) { mapped_key = KEY_MICMUTE; goto mapped; } break; default: if (allow_consumer || allow_mute) goto defaulted; } goto ignored; } /* handle standard consumer control mapping */ /* and standard telephony mic mute mapping */ if (allow_consumer || allow_mute) goto defaulted; /* handle vendor unique types - plt_type is 0xffa0uuuu or 0xffa2uuuu */ /* if not 'basic telephony compliant' - map vendor unique controls */ if (!((plt_type & HID_USAGE) >= PLT_BASIC_TELEPHONY && (plt_type & HID_USAGE) != PLT_BASIC_EXCEPTION) && !((field->application ^ plt_type) & HID_USAGE_PAGE)) switch (usage->hid) { case PLT1_VOL_UP: case PLT2_VOL_UP: mapped_key = KEY_VOLUMEUP; goto mapped; case PLT1_VOL_DOWN: case PLT2_VOL_DOWN: mapped_key = KEY_VOLUMEDOWN; goto mapped; case PLT1_MIC_MUTE: case PLT2_MIC_MUTE: mapped_key = KEY_MICMUTE; goto mapped; } /* * Future mapping of call control or other usages, * if and when keys are defined would go here * otherwise, ignore everything else that was not mapped */ ignored: hid_dbg(hdev, "usage: %08x (appl: %08x) - ignored\n", usage->hid, field->application); return -1; defaulted: hid_dbg(hdev, "usage: %08x (appl: %08x) - defaulted\n", usage->hid, field->application); return 0; mapped: hid_map_usage_clear(hi, usage, bit, max, EV_KEY, mapped_key); hid_dbg(hdev, "usage: %08x (appl: %08x) - mapped to key %d\n", usage->hid, field->application, mapped_key); return 1; } static int plantronics_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct plt_drv_data *drv_data = hid_get_drvdata(hdev); unsigned long prev_tsto, cur_ts; __u16 prev_key, cur_key; /* Usages are filtered in plantronics_usages. */ /* HZ too low for ms resolution - double key detection disabled */ /* or it is a key release - handle key presses only. */ if (!drv_data->double_key_to || !value) return 0; prev_tsto = drv_data->last_key_ts + drv_data->double_key_to; cur_ts = drv_data->last_key_ts = jiffies; prev_key = drv_data->last_key; cur_key = drv_data->last_key = usage->code; /* If the same key occurs in <= double_key_to -- ignore it */ if (prev_key == cur_key && time_before_eq(cur_ts, prev_tsto)) { hid_dbg(hdev, "double key %d ignored\n", cur_key); return 1; /* Ignore the repeated key. */ } return 0; } static unsigned long plantronics_device_type(struct hid_device *hdev) { unsigned i, col_page; unsigned long plt_type = hdev->product; /* multi-HID interfaces? - plt_type is PID */ if (plt_type >= PLT_BT300_MIN && plt_type <= PLT_BT300_MAX) goto exit; /* determine primary vendor page */ for (i = 0; i < hdev->maxcollection; i++) { col_page = hdev->collection[i].usage & HID_USAGE_PAGE; if (col_page == PLT_HID_2_0_PAGE) { plt_type = hdev->collection[i].usage; break; } if (col_page == PLT_HID_1_0_PAGE) plt_type = hdev->collection[i].usage; } exit: hid_dbg(hdev, "plt_type decoded as: %08lx\n", plt_type); return plt_type; } static int plantronics_probe(struct hid_device *hdev, const struct hid_device_id *id) { struct plt_drv_data *drv_data; int ret; drv_data = devm_kzalloc(&hdev->dev, sizeof(*drv_data), GFP_KERNEL); if (!drv_data) return -ENOMEM; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); return ret; } drv_data->device_type = plantronics_device_type(hdev); drv_data->double_key_to = msecs_to_jiffies(PLT_DOUBLE_KEY_TIMEOUT); drv_data->last_key_ts = jiffies - drv_data->double_key_to; /* if HZ does not allow ms resolution - disable double key detection */ if (drv_data->double_key_to < PLT_DOUBLE_KEY_TIMEOUT) drv_data->double_key_to = 0; hid_set_drvdata(hdev, drv_data); ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT | HID_CONNECT_HIDINPUT_FORCE | HID_CONNECT_HIDDEV_FORCE); if (ret) hid_err(hdev, "hw start failed\n"); return ret; } static const struct hid_device_id plantronics_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_PLANTRONICS, HID_ANY_ID) }, { } }; MODULE_DEVICE_TABLE(hid, plantronics_devices); static const struct hid_usage_id plantronics_usages[] = { { HID_CP_VOLUMEUP, EV_KEY, HID_ANY_ID }, { HID_CP_VOLUMEDOWN, EV_KEY, HID_ANY_ID }, { HID_TELEPHONY_MUTE, EV_KEY, HID_ANY_ID }, { HID_CONSUMER_MUTE, EV_KEY, HID_ANY_ID }, { PLT2_VOL_UP, EV_KEY, HID_ANY_ID }, { PLT2_VOL_DOWN, EV_KEY, HID_ANY_ID }, { PLT2_MIC_MUTE, EV_KEY, HID_ANY_ID }, { PLT1_VOL_UP, EV_KEY, HID_ANY_ID }, { PLT1_VOL_DOWN, EV_KEY, HID_ANY_ID }, { PLT1_MIC_MUTE, EV_KEY, HID_ANY_ID }, { HID_TERMINATOR, HID_TERMINATOR, HID_TERMINATOR } }; static struct hid_driver plantronics_driver = { .name = "plantronics", .id_table = plantronics_devices, .usage_table = plantronics_usages, .input_mapping = plantronics_input_mapping, .event = plantronics_event, .probe = plantronics_probe, }; module_hid_driver(plantronics_driver); MODULE_AUTHOR("JD Cole <jd.cole@plantronics.com>"); MODULE_AUTHOR("Terry Junge <terry.junge@plantronics.com>"); MODULE_DESCRIPTION("Plantronics USB HID Driver"); MODULE_LICENSE("GPL");
291 90 200 291 5 199 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to generic timeout handling of requests. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/blkdev.h> #include <linux/fault-inject.h> #include "blk.h" #include "blk-mq.h" #ifdef CONFIG_FAIL_IO_TIMEOUT static DECLARE_FAULT_ATTR(fail_io_timeout); static int __init setup_fail_io_timeout(char *str) { return setup_fault_attr(&fail_io_timeout, str); } __setup("fail_io_timeout=", setup_fail_io_timeout); bool __blk_should_fake_timeout(struct request_queue *q) { return should_fail(&fail_io_timeout, 1); } EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", NULL, &fail_io_timeout); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_io_timeout_debugfs); ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags); return sprintf(buf, "%d\n", set != 0); } ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct gendisk *disk = dev_to_disk(dev); int val; if (count) { struct request_queue *q = disk->queue; char *p = (char *) buf; val = simple_strtoul(p, &p, 10); if (val) blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q); else blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); } return count; } #endif /* CONFIG_FAIL_IO_TIMEOUT */ /** * blk_abort_request - Request recovery for the specified command * @req: pointer to the request of interest * * This function requests that the block layer start recovery for the * request by deleting the timer and calling the q's timeout function. * LLDDs who implement their own error recovery MAY ignore the timeout * event if they generated blk_abort_request. */ void blk_abort_request(struct request *req) { /* * All we need to ensure is that timeout scan takes place * immediately and that scan sees the new timeout value. * No need for fancy synchronizations. */ WRITE_ONCE(req->deadline, jiffies); kblockd_schedule_work(&req->q->timeout_work); } EXPORT_SYMBOL_GPL(blk_abort_request); static unsigned long blk_timeout_mask __read_mostly; static int __init blk_timeout_init(void) { blk_timeout_mask = roundup_pow_of_two(HZ) - 1; return 0; } late_initcall(blk_timeout_init); /* * Just a rough estimate, we don't care about specific values for timeouts. */ static inline unsigned long blk_round_jiffies(unsigned long j) { return (j + blk_timeout_mask) + 1; } unsigned long blk_rq_timeout(unsigned long timeout) { unsigned long maxt; maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); if (time_after(timeout, maxt)) timeout = maxt; return timeout; } /** * blk_add_timer - Start timeout timer for a single request * @req: request that is about to start running. * * Notes: * Each request has its own timer, and as it is added to the queue, we * set up the timer. When the request completes, we cancel the timer. */ void blk_add_timer(struct request *req) { struct request_queue *q = req->q; unsigned long expiry; /* * Some LLDs, like scsi, peek at the timeout to prevent a * command from being retried forever. */ if (!req->timeout) req->timeout = q->rq_timeout; req->rq_flags &= ~RQF_TIMED_OUT; expiry = jiffies + req->timeout; WRITE_ONCE(req->deadline, expiry); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ expiry = blk_rq_timeout(blk_round_jiffies(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { unsigned long diff = q->timeout.expires - expiry; /* * Due to added timer slack to group timers, the timer * will often be a little in front of what we asked for. * So apply some tolerance here too, otherwise we keep * modifying the timer because expires for value X * will be X + something. */ if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) mod_timer(&q->timeout, expiry); } }
124 122 1 2 1 4 17 2 2 87 84 3 106 76 7 72 1 5 1 5 90 175 24 21 23 24 1 23 22 2 1 24 22 3 3 2 2 24 28 17 1 3 3 1 1 1 2 1 1 1 1 1 21 122 123 123 122 122 1 122 77 46 48 59 47 124 8 122 122 122 1 1 4 4 11 11 3 1 1 1 2 1 1 3 1 1 2 2 3 42 96 234 232 228 220 106 65 49 84 68 65 16 133 4 98 3 1 11 1 15 120 93 133 69 72 62 12 1 69 66 4 4 67 19 5 3 22 149 202 188 12 10 14 14 1 9 11 12 1 176 199 1 54 149 53 155 156 156 67 58 104 24 15 1 116 117 5 79 98 81 111 109 109 105 48 107 96 11 90 91 91 89 91 91 13 90 7 6 6 3 3 2 8 11 11 1 11 11 69 108 38 32 63 21 20 23 50 4 48 12 36 8 81 104 10 113 72 72 115 9 116 1 119 185 2 2 5 4 2 1 1 2 1 4 4 1 3 67 24 9 11 55 23 28 34 67 47 43 15 38 37 3 13 1 4 1 5 6 2 4 1 3 6 3 6 2 5 5 5 5 5 1 4 1 2 1 1 2 1 1 1 8 2 9 3 7 6 4 9 7 3 1 2 1 2 1 1 2 3 1 2 2 9 6 4 1 5 1 1 1 4 1 3 9 1 1 1 1 32 32 1 1 7 18 11 1 3 6 5 1 1 8 8 2 7 7 5 4 1 3 8 1 9 9 4 2 5 6 3 1 1 1 2 4 6 4 1 1 31 32 31 67 1 63 2 67 4 4 4 43 47 46 3 4 1 39 39 3 37 1 23 1 2 11 1 9 3 26 1 26 1 22 13 2 1 3 18 20 19 1 3 34 36 2 35 2 8 3 29 38 31 8 4 60 1 60 2 1 44 1 2 2 1 1 219 221 175 83 8 126 52 83 218 50 41 1 40 39 39 1 1 5 5 5 5 5 4 5 96 7 93 4 78 10 1 12 1 12 10 95 95 93 90 5 4 89 89 81 78 4 82 18 3 248 55 256 257 257 64 2 52 13 6 44 8 40 7 64 64 64 64 63 64 63 64 63 18 2 16 7 1 1 5 1 1 1 1 7 7 6 4 4 2 1 1 2 1 2 3 3 2 2 1 19 3 16 19 6 14 4 1 4 2 1 1 1 1 1 1 18 15 2 2 1 16 16 2 16 16 2 40 1 1 1 4 33 15 3 16 163 1 1 16 1 1 6 6 5 3 132 2 2 2 2 1 1 1 3 1 1 1 1 1 1 4 30 4 1 1 1 1 1 1 12 1 1 1 1 1 1 1 1 1 3 6 3 3 1 1 2 5 1 3 1 1 1 1 1 2 2 1 2 7 2 1 1 1 3 338 177 160 14 8 43 31 10 9 10 9 10 9 15 2 16 1 17 17 17 17 17 15 4 15 2 17 1 1 2 2 2 2 2 117 115 1 5 3 1 1 3 1 1 1 2 2 2 1 1 1 2 5 1 1 1 2 6 1 7 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 34 1 2 1 1 1 22 9 23 7 26 31 1 1 1 1 45 44 204 87 116 7 7 7 2 2 7 7 6 7 1 1 85 86 86 86 80 8 5 5 2 3 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> * * Fixes: * Alan Cox : Numerous verify_area() calls * Alan Cox : Set the ACK bit on a reset * Alan Cox : Stopped it crashing if it closed while * sk->inuse=1 and was trying to connect * (tcp_err()). * Alan Cox : All icmp error handling was broken * pointers passed where wrong and the * socket was looked up backwards. Nobody * tested any icmp error code obviously. * Alan Cox : tcp_err() now handled properly. It * wakes people on errors. poll * behaves and the icmp error race * has gone by moving it into sock.c * Alan Cox : tcp_send_reset() fixed to work for * everything not just packets for * unknown sockets. * Alan Cox : tcp option processing. * Alan Cox : Reset tweaked (still not 100%) [Had * syn rule wrong] * Herp Rosmanith : More reset fixes * Alan Cox : No longer acks invalid rst frames. * Acking any kind of RST is right out. * Alan Cox : Sets an ignore me flag on an rst * receive otherwise odd bits of prattle * escape still * Alan Cox : Fixed another acking RST frame bug. * Should stop LAN workplace lockups. * Alan Cox : Some tidyups using the new skb list * facilities * Alan Cox : sk->keepopen now seems to work * Alan Cox : Pulls options out correctly on accepts * Alan Cox : Fixed assorted sk->rqueue->next errors * Alan Cox : PSH doesn't end a TCP read. Switched a * bit to skb ops. * Alan Cox : Tidied tcp_data to avoid a potential * nasty. * Alan Cox : Added some better commenting, as the * tcp is hard to follow * Alan Cox : Removed incorrect check for 20 * psh * Michael O'Reilly : ack < copied bug fix. * Johannes Stille : Misc tcp fixes (not all in yet). * Alan Cox : FIN with no memory -> CRASH * Alan Cox : Added socket option proto entries. * Also added awareness of them to accept. * Alan Cox : Added TCP options (SOL_TCP) * Alan Cox : Switched wakeup calls to callbacks, * so the kernel can layer network * sockets. * Alan Cox : Use ip_tos/ip_ttl settings. * Alan Cox : Handle FIN (more) properly (we hope). * Alan Cox : RST frames sent on unsynchronised * state ack error. * Alan Cox : Put in missing check for SYN bit. * Alan Cox : Added tcp_select_window() aka NET2E * window non shrink trick. * Alan Cox : Added a couple of small NET2E timer * fixes * Charles Hedrick : TCP fixes * Toomas Tamm : TCP window fixes * Alan Cox : Small URG fix to rlogin ^C ack fight * Charles Hedrick : Rewrote most of it to actually work * Linus : Rewrote tcp_read() and URG handling * completely * Gerhard Koerting: Fixed some missing timer handling * Matthew Dillon : Reworked TCP machine states as per RFC * Gerhard Koerting: PC/TCP workarounds * Adam Caldwell : Assorted timer/timing errors * Matthew Dillon : Fixed another RST bug * Alan Cox : Move to kernel side addressing changes. * Alan Cox : Beginning work on TCP fastpathing * (not yet usable) * Arnt Gulbrandsen: Turbocharged tcp_check() routine. * Alan Cox : TCP fast path debugging * Alan Cox : Window clamping * Michael Riepe : Bug in tcp_check() * Matt Dillon : More TCP improvements and RST bug fixes * Matt Dillon : Yet more small nasties remove from the * TCP code (Be very nice to this man if * tcp finally works 100%) 8) * Alan Cox : BSD accept semantics. * Alan Cox : Reset on closedown bug. * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). * Michael Pall : Handle poll() after URG properly in * all cases. * Michael Pall : Undo the last fix in tcp_read_urg() * (multi URG PUSH broke rlogin). * Michael Pall : Fix the multi URG PUSH problem in * tcp_readable(), poll() after URG * works now. * Michael Pall : recv(...,MSG_OOB) never blocks in the * BSD api. * Alan Cox : Changed the semantics of sk->socket to * fix a race and a signal problem with * accept() and async I/O. * Alan Cox : Relaxed the rules on tcp_sendto(). * Yury Shevchuk : Really fixed accept() blocking problem. * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for * clients/servers which listen in on * fixed ports. * Alan Cox : Cleaned the above up and shrank it to * a sensible code size. * Alan Cox : Self connect lockup fix. * Alan Cox : No connect to multicast. * Ross Biro : Close unaccepted children on master * socket close. * Alan Cox : Reset tracing code. * Alan Cox : Spurious resets on shutdown. * Alan Cox : Giant 15 minute/60 second timer error * Alan Cox : Small whoops in polling before an * accept. * Alan Cox : Kept the state trace facility since * it's handy for debugging. * Alan Cox : More reset handler fixes. * Alan Cox : Started rewriting the code based on * the RFC's for other useful protocol * references see: Comer, KA9Q NOS, and * for a reference on the difference * between specifications and how BSD * works see the 4.4lite source. * A.N.Kuznetsov : Don't time wait on completion of tidy * close. * Linus Torvalds : Fin/Shutdown & copied_seq changes. * Linus Torvalds : Fixed BSD port reuse to work first syn * Alan Cox : Reimplemented timers as per the RFC * and using multiple timers for sanity. * Alan Cox : Small bug fixes, and a lot of new * comments. * Alan Cox : Fixed dual reader crash by locking * the buffers (much like datagram.c) * Alan Cox : Fixed stuck sockets in probe. A probe * now gets fed up of retrying without * (even a no space) answer. * Alan Cox : Extracted closing code better * Alan Cox : Fixed the closing state machine to * resemble the RFC. * Alan Cox : More 'per spec' fixes. * Jorge Cwik : Even faster checksumming. * Alan Cox : tcp_data() doesn't ack illegal PSH * only frames. At least one pc tcp stack * generates them. * Alan Cox : Cache last socket. * Alan Cox : Per route irtt. * Matt Day : poll()->select() match BSD precisely on error * Alan Cox : New buffers * Marc Tamsky : Various sk->prot->retransmits and * sk->retransmits misupdating fixed. * Fixed tcp_write_timeout: stuck close, * and TCP syn retries gets used now. * Mark Yarvis : In tcp_read_wakeup(), don't send an * ack if state is TCP_CLOSED. * Alan Cox : Look up device on a retransmit - routes may * change. Doesn't yet cope with MSS shrink right * but it's a start! * Marc Tamsky : Closing in closing fixes. * Mike Shaver : RFC1122 verifications. * Alan Cox : rcv_saddr errors. * Alan Cox : Block double connect(). * Alan Cox : Small hooks for enSKIP. * Alexey Kuznetsov: Path MTU discovery. * Alan Cox : Support soft errors. * Alan Cox : Fix MTU discovery pathological case * when the remote claims no mtu! * Marc Tamsky : TCP_CLOSE fix. * Colin (G3TNE) : Send a reset on syn ack replies in * window but wrong (fixes NT lpd problems) * Pedro Roque : Better TCP window handling, delayed ack. * Joerg Reuter : No modification of locked buffers in * tcp_do_retransmit() * Eric Schenk : Changed receiver side silly window * avoidance algorithm to BSD style * algorithm. This doubles throughput * against machines running Solaris, * and seems to result in general * improvement. * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * Keith Owens : Do proper merging with partial SKB's in * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and * lingertime == 0 (RFC 793 ABORT Call) * Hirokazu Takahashi : Use copy_from_user() instead of * csum_and_copy_from_user() if possible. * * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack * * TCP_SYN_RECV received a connection request, sent ack, * waiting for final ack in three-way handshake. * * TCP_ESTABLISHED connection established * * TCP_FIN_WAIT1 our side has shutdown, waiting to complete * transmission of remaining buffered data * * TCP_FIN_WAIT2 all buffered data sent, waiting for remote * to shutdown * * TCP_CLOSING both sides have shutdown but we still have * data we have to finish sending * * TCP_TIME_WAIT timeout to catch resent junk before entering * closed, can only be entered from FIN_WAIT2 * or CLOSING. Required because the other end * may not have gotten our last ACK causing it * to retransmit the data packet (which we ignore) * * TCP_CLOSE_WAIT remote side has shutdown and is waiting for * us to finish writing our data and to shutdown * (we have to close() to move on to LAST_ACK) * * TCP_LAST_ACK out side has shutdown after remote has * shutdown. There may still be data in our * buffer that we have to finish sending * * TCP_CLOSE socket is finished */ #define pr_fmt(fmt) "TCP: " fmt #include <crypto/md5.h> #include <crypto/utils.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/random.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/static_key.h> #include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/psp.h> #include <net/sock.h> #include <net/rstreason.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> #include <net/hotdata.h> #include <trace/events/tcp.h> #include <net/rps.h> #include "../core/devmem.h" /* Track pending CMSGs. */ enum { TCP_CMSG_INQ = 1, TCP_CMSG_TS = 2 }; DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); DEFINE_PER_CPU(u32, tcp_tw_isn); EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); long sysctl_tcp_mem[3] __read_mostly; EXPORT_IPV6_MOD(sysctl_tcp_mem); DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); #if IS_ENABLED(CONFIG_SMC) DEFINE_STATIC_KEY_FALSE(tcp_have_smc); EXPORT_SYMBOL(tcp_have_smc); #endif /* * Current number of TCP sockets. */ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_IPV6_MOD(tcp_sockets_allocated); /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long tcp_memory_pressure __read_mostly; EXPORT_SYMBOL_GPL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; if (!val) val--; if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); } EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure); void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, jiffies_to_msecs(jiffies - val)); } EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) { u8 res = 0; if (seconds > 0) { int period = timeout; res = 1; while (seconds > period && res < 255) { res++; timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return res; } /* Convert retransmits to seconds based on initial and max timeout */ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) { int period = 0; if (retrans > 0) { period = timeout; while (--retrans) { timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return period; } static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) { u32 rate = READ_ONCE(tp->rate_delivered); u32 intv = READ_ONCE(tp->rate_interval_us); u64 rate64 = 0; if (rate && intv) { rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; do_div(rate64, intv); } return rate64; } #ifdef CONFIG_TCP_MD5SIG void tcp_md5_destruct_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (tp->md5sig_info) { tcp_clear_md5_list(sk); kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1)); static_branch_slow_dec_deferred(&tcp_md5_needed); } } EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); #endif /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int rto_min_us, rto_max_ms; tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms); icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms); rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ tcp_snd_cwnd_set(tp, TCP_INIT_CWND); /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_sync_mss = tcp_sync_mss; WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1); } EXPORT_IPV6_MOD(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) { struct sk_buff *skb = tcp_write_queue_tail(sk); u32 tsflags = sockc->tsflags; if (unlikely(!skb)) skb = skb_rb_last(&sk->tcp_rtx_queue); if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); sock_tx_timestamp(sk, sockc, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) tcb->txstamp_ack |= TSTAMP_ACK_SK; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb) bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB); } /* @wake is one when sk_stream_write_space() calls us. * This sends EPOLLOUT only if notsent_bytes is half the limit. * This mimics the strategy used in sock_def_write_space(). */ bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } EXPORT_SYMBOL(tcp_stream_memory_free); static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) return true; return sk_is_readable(sk); } /* * Wait for a TCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events * by poll logic and correct handling of state changes * made by other threads is impossible in any case. */ mask = 0; /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a * socket the read side is more interesting. * * Some poll() documentation says that EPOLLHUP is incompatible * with the EPOLLOUT/POLLWR flags, so somebody should check this * all. But careful, it tends to be safer to return too many * bits than too few, and you can easily break real applications * if you don't tell them that something has hung up! * * Check-me. * * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and * our fs/select.c). It means that after we received EOF, * poll always returns immediately, making impossible poll() on write() * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX * solve this dilemma. I would prefer, if EPOLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains * why EPOLLHUP is incompatible with EPOLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? */ if (state != TCP_SYN_SENT && (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); u16 urg_data = READ_ONCE(tp->urg_data); if (unlikely(urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE)) target++; if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. Memory barrier * pairs with the input side. */ smp_mb__after_atomic(); if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else mask |= EPOLLOUT | EPOLLWRNORM; if (urg_data & TCP_URG_VALID) mask |= EPOLLPRI; } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* Active TCP fastopen socket with defer_connect * Return EPOLLOUT so application can call write() * in order for kernel to generate SYN+data */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */ smp_rmb(); if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; } EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, int *karg) { struct tcp_sock *tp = tcp_sk(sk); int answ; bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; slow = lock_sock_fast(sk); answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: answ = READ_ONCE(tp->urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; } *karg = answ; return 0; } EXPORT_IPV6_MOD(tcp_ioctl); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; } static inline bool forced_push(const struct tcp_sock *tp) { return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) { if (flags & MSG_OOB) tp->snd_up = tp->write_seq; } /* If a not yet filled skb is pushed, do not send it if * we have data packets in Qdisc or NIC queues : * Because TX completion will happen shortly, it gives a chance * to coalesce future sendmsg() payload into this skb, without * need for a timer, and with no latency trade off. * As packets containing data payload have a bigger truesize * than pure acks (dataless) packets, the last checks prevent * autocorking if we only have an ACK in Qdisc/NIC queues, * or if TX completion was delayed after we processed ACK packet. */ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && !tcp_rtx_queue_empty(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize && tcp_skb_can_collapse_to(skb); } void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; skb = tcp_write_queue_tail(sk); if (!skb) return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags); if (tcp_should_autocork(sk, skb, size_goal)) { /* avoid atomic op if TSQ_THROTTLED bit is already set */ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); smp_mb__after_atomic(); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. */ if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize) return; } if (flags & MSG_MORE) nonagle = TCP_NAGLE_CORK; __tcp_push_pending_frames(sk, mss_now, nonagle); } int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, min(rd_desc->count, len), tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; } static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) { /* Store TCP splice context information in read_descriptor_t. */ read_descriptor_t rd_desc = { .arg.data = tss, .count = tss->len, }; return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); } /** * tcp_splice_read - splice data from TCP socket to a pipe * @sock: socket to splice from * @ppos: position (not valid) * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will read pages from given socket and fill them into a pipe. * **/ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct tcp_splice_state tss = { .pipe = pipe, .len = len, .flags = flags, }; long timeo; ssize_t spliced; int ret; sock_rps_record_flow(sk); /* * We can't seek on a socket input */ if (unlikely(*ppos)) return -ESPIPE; ret = spliced = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); while (tss.len) { ret = __tcp_splice_read(sk, &tss); if (ret < 0) break; else if (!ret) { if (spliced) break; if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { ret = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* * This occurs when user tries to read * from never connected socket. */ ret = -ENOTCONN; break; } if (!timeo) { ret = -EAGAIN; break; } /* if __tcp_splice_read() got nothing while we have * an skb in receive queue, we do not want to loop. * This might happen with URG data. */ if (!skb_queue_empty(&sk->sk_receive_queue)) break; ret = sk_wait_data(sk, &timeo, NULL); if (ret < 0) break; if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } continue; } tss.len -= ret; spliced += ret; if (!tss.len || !timeo) break; release_sock(sk); lock_sock(sk); if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } release_sock(sk); if (spliced) return spliced; return ret; } EXPORT_IPV6_MOD(tcp_splice_read); /* We allow to exceed memory limits for FIN packets to expedite * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. * In general, we want to allow one skb per socket to avoid hangs * with edge trigger epoll() */ void sk_forced_mem_schedule(struct sock *sk, int size) { int delta, amt; delta = size - sk->sk_forward_alloc; if (delta <= 0) return; amt = sk_mem_pages(delta); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); if (mem_cgroup_sk_enabled(sk)) mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); if (sk->sk_bypass_prot_mem) return; sk_memory_allocated_add(sk, amt); } struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); } else { mem_scheduled = sk_wmem_schedule(sk, skb->truesize); } if (likely(mem_scheduled)) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { if (!sk->sk_bypass_prot_mem) tcp_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; } static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; if (!large_allowed) return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size); /* We try hard to avoid divides here */ size_goal = tp->gso_segs * mss_now; if (unlikely(new_size_goal < size_goal || new_size_goal >= size_goal + mss_now)) { tp->gso_segs = min_t(u16, new_size_goal / mss_now, sk->sk_gso_max_segs); size_goal = tp->gso_segs * mss_now; } return max(size_goal, mss_now); } int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; mss_now = tcp_current_mss(sk); *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); return mss_now; } /* In some cases, sendmsg() could have added an skb to the write queue, * but failed adding payload on it. We need to remove it to consume less * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger * epoll() users. Another reason is that tcp_write_xmit() does not like * finding an empty skb in the write queue. */ void tcp_remove_empty_skb(struct sock *sk) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { tcp_unlink_write_queue(skb, sk); if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); tcp_wmem_free_skb(sk, skb); } } /* skb changing from pure zc to mixed, must charge zc */ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) { if (unlikely(skb_zcopy_pure(skb))) { u32 extra = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb)); if (!sk_wmem_schedule(sk, extra)) return -ENOMEM; sk_mem_charge(sk, extra); skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; } return 0; } int tcp_wmem_schedule(struct sock *sk, int copy) { int left; if (likely(sk_wmem_schedule(sk, copy))) return copy; /* We could be in trouble if we have nothing queued. * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0] * to guarantee some progress. */ left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) - sk->sk_wmem_queued; if (left > 0) sk_forced_mem_schedule(sk, min(left, copy)); return min(copy, sk->sk_forward_alloc); } void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } } int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sockaddr *uaddr = msg->msg_name; int err, flags; if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc_obj(struct tcp_fastopen_request, sk->sk_allocation); if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; tp->fastopen_req->uarg = uarg; if (inet_test_bit(DEFER_CONNECT, sk)) { err = tcp_connect(sk); /* Same failure procedure as in tcp_v4/6_connect */ if (err) { tcp_set_state(sk, TCP_CLOSE); inet->inet_dport = 0; sk->sk_route_caps = 0; } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst */ if (tp->fastopen_req) { *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); } return err; } /* If a gap is detected between sends, mark the socket application-limited. */ void tcp_rate_check_app_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (/* We have less than one packet to send. */ tp->write_seq - tp->snd_nxt < tp->mss_cache && /* Nothing in sending host's qdisc queues or NIC tx queue. */ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && /* We are not limited by CWND. */ tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && /* All lost packets have been retransmitted. */ tp->lost_out <= tp->retrans_out) tp->app_limited = (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; } EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; int sockc_err = 0; int zc = 0; long timeo; flags = msg->msg_flags; sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) }; if (msg->msg_controllen) { sockc_err = sock_cmsg_send(sk, msg, &sockc); /* Don't return error until MSG_FASTOPEN has been processed; * that may succeed even if the cmsg is invalid. */ } if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), !sockc_err && sockc.dmabuf_id); if (!uarg) { err = -ENOBUFS; goto out_err; } if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; if (!sockc_err && sockc.dmabuf_id) { binding = net_devmem_get_binding(sk, sockc.dmabuf_id); if (IS_ERR(binding)) { err = PTR_ERR(binding); binding = NULL; goto out_err; } } } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } if (!sockc_err && sockc.dmabuf_id && (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { err = -EINVAL; goto out_err; } if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) goto out_err; } timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. */ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto do_error; } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); goto out_nopush; } err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out_err; /* 'common' sending to sendq */ } if (sockc_err) { err = sockc_err; goto out_err; } /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Ok commence sending. */ copied = 0; restart: mss_now = tcp_send_mss(sk, &size_goal, flags); err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { int copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; trace_tcp_sendmsg_locked(sk, msg, skb, size_goal); if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; if (sk_flush_backlog(sk)) goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); skb = tcp_stream_alloc_skb(sk, sk->sk_allocation, first_skb); if (!skb) goto wait_for_space; process_backlog++; #ifdef CONFIG_SKB_DECRYPTED skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); #endif tcp_skb_entail(sk, skb); copy = size_goal; /* All packets are restored as if they have * already been sent. skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; } /* Try to append data to the end of skb. */ if (copy > msg_data_left(msg)) copy = msg_data_left(msg); if (zc == 0) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } merge = false; } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; skb_zcopy_downgrade_managed(skb); } copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto do_error; /* Update the skb. */ if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); page_ref_inc(pfrag->page); } pfrag->offset += copy; } else if (zc == MSG_ZEROCOPY) { /* First append to a fragless skb builds initial * pure zerocopy skb */ if (!skb->len) skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; if (!skb_zcopy_pure(skb)) { copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, binding); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; } if (err < 0) goto do_error; copy = err; } else if (zc == MSG_SPLICE_PAGES) { /* Splice in data if we can; copy if we can't. */ if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_splice_from_iter(skb, &msg->msg_iter, copy); if (err < 0) { if (err == -EMSGSIZE) { tcp_mark_push(tp, skb); goto new_segment; } goto do_error; } copy = err; if (!(flags & MSG_NO_SHARED_FRAGS)) skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); } if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); copied += copy; if (!msg_data_left(msg)) { if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; } if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_remove_empty_skb(sk); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); } out: if (copied) { tcp_tx_timestamp(sk, &sockc); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); if (binding) net_devmem_dmabuf_binding_put(binding); return copied + copied_syn; do_error: tcp_remove_empty_skb(sk); if (copied + copied_syn) goto out; out_err: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { READ_ONCE(sk->sk_write_space)(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } if (binding) net_devmem_dmabuf_binding_put(binding); return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int ret; lock_sock(sk); ret = tcp_sendmsg_locked(sk, msg, size); release_sock(sk); return ret; } EXPORT_SYMBOL(tcp_sendmsg); void tcp_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); int mss_now, size_goal; if (!tcp_write_queue_tail(sk)) return; lock_sock(sk); mss_now = tcp_send_mss(sk, &size_goal, 0); tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); release_sock(sk); } EXPORT_IPV6_MOD_GPL(tcp_splice_eof); /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); /* No URG data to read. */ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) return -ENOTCONN; if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) WRITE_ONCE(tp->urg_data, TCP_URG_READ); /* Read urgent data. */ msg->msg_flags |= MSG_OOB; if (len > 0) { if (!(flags & MSG_TRUNC)) err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; return err ? -EFAULT : len; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 0; /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and * the available implementations agree in this case: * this call should never block, independent of the * blocking state of the socket. * Mike <pall@rz.uni-karlsruhe.de> */ return -EAGAIN; } static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) { struct sk_buff *skb; int copied = 0, err = 0; skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) return err; copied += skb->len; } skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; copied += skb->len; } return err ?: copied; } /* Clean up the receive buffer for full frames taken by the user, * then send an ACK if necessary. COPIED is the number of bytes * tcp_recvmsg has given to the user so far, it speeds up the * calculation of whether or not we must ACK for the sake of * a window update. */ void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". * * Even if window raised up to infinity, do not send window open ACK * in states, where we will not receive more. It is useless. */ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); /* Optimize, __tcp_select_window() is not cheap. */ if (2*rcv_window_now <= tp->window_clamp) { __u32 new_window = __tcp_select_window(sk); /* Send ACK now, if this read freed lots of space * in our buffer. Certainly, new_window is new window. * We can advertise it now, if it is not less than current one. * "Lots" means "at least twice" here. */ if (new_window && new_window >= 2 * rcv_window_now) time_to_ack = true; } } if (time_to_ack) { tcp_mstamp_refresh(tp); tcp_send_ack(sk); } } void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tcp_sock *tp = tcp_sk(sk); WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); __tcp_cleanup_rbuf(sk, copied); } static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); if (likely(skb->destructor == sock_rfree)) { sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; return skb_attempt_defer_free(skb); } __kfree_skb(skb); } struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } /* This looks weird, but this can happen if TCP collapsing * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ tcp_eat_recv_skb(sk, skb); } return NULL; } EXPORT_SYMBOL(tcp_recv_skb); /* * This routine provides an alternative to tcp_recvmsg() for routines * that would like to handle copying from skbuffs directly in 'sendfile' * fashion. * Note: * - It is assumed that the socket was locked by the caller. * - The routine does not block. * - At present, there is no support for reading OOB data * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor, bool noack, u32 *copied_seq) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); u32 seq = *copied_seq; u32 offset; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { if (offset < skb->len) { int used; size_t len; len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - seq; if (urg_offset < len) len = urg_offset; if (!len) break; } used = recv_actor(desc, skb, offset, len); if (used <= 0) { if (!copied) copied = used; break; } if (WARN_ON_ONCE(used > len)) used = len; seq += used; copied += used; offset += used; /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it * while aggregating skbs from the socket queue. */ skb = tcp_recv_skb(sk, seq - 1, &offset); if (!skb) break; /* TCP coalescing might have appended data to the skb. * Try to splice more frags */ if (offset + 1 != skb->len) continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); if (!desc->count) break; WRITE_ONCE(*copied_seq, seq); } WRITE_ONCE(*copied_seq, seq); if (noack) goto out; tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } out: return copied; } int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { return __tcp_read_sock(sk, desc, recv_actor, false, &tcp_sk(sk)->copied_seq); } EXPORT_SYMBOL(tcp_read_sock); int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor, bool noack, u32 *copied_seq) { return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq); } int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u8 tcp_flags; int used; __skb_unlink(skb, &sk->sk_receive_queue); WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); tcp_flags = TCP_SKB_CB(skb)->tcp_flags; used = recv_actor(sk, skb); if (used < 0) { if (!copied) copied = used; break; } copied += used; if (tcp_flags & TCPHDR_FIN) break; } return copied; } EXPORT_IPV6_MOD(tcp_read_skb); void tcp_read_done(struct sock *sk, size_t len) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; struct sk_buff *skb; size_t left; u32 offset; if (sk->sk_state == TCP_LISTEN) return; left = len; while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { int used; used = min_t(size_t, skb->len - offset, left); seq += used; left -= used; if (skb->len > offset + used) break; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (left != len) tcp_cleanup_rbuf(sk, len - left); } EXPORT_SYMBOL(tcp_read_done); int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); } EXPORT_IPV6_MOD(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); if (tp->window_clamp && tp->window_clamp < val) WRITE_ONCE(tp->window_clamp, val); } return 0; } EXPORT_IPV6_MOD(tcp_set_rcvlowat); #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; } EXPORT_IPV6_MOD(tcp_mmap); static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, u32 *offset_frag) { skb_frag_t *frag; if (unlikely(offset_skb >= skb->len)) return NULL; offset_skb -= skb_headlen(skb); if ((int)offset_skb < 0 || skb_has_frag_list(skb)) return NULL; frag = skb_shinfo(skb)->frags; while (offset_skb) { if (skb_frag_size(frag) > offset_skb) { *offset_frag = offset_skb; return frag; } offset_skb -= skb_frag_size(frag); ++frag; } *offset_frag = 0; return frag; } static bool can_map_frag(const skb_frag_t *frag) { struct page *page; if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag)) return false; page = skb_frag_page(frag); if (PageCompound(page) || page->mapping) return false; return true; } static int find_next_mappable_frag(const skb_frag_t *frag, int remaining_in_skb) { int offset = 0; if (likely(can_map_frag(frag))) return 0; while (offset < remaining_in_skb && !can_map_frag(frag)) { offset += skb_frag_size(frag); ++frag; } return offset; } static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) { u32 frag_offset, partial_frag_remainder = 0; int mappable_offset; skb_frag_t *frag; /* worst case: skip to next skb. try to improve on this case below */ zc->recv_skip_hint = skb->len - offset; /* Find the frag containing this offset (and how far into that frag) */ frag = skb_advance_to_frag(skb, offset, &frag_offset); if (!frag) return; if (frag_offset) { struct skb_shared_info *info = skb_shinfo(skb); /* We read part of the last frag, must recvmsg() rest of skb. */ if (frag == &info->frags[info->nr_frags - 1]) return; /* Else, we must at least read the remainder in this frag. */ partial_frag_remainder = skb_frag_size(frag) - frag_offset; zc->recv_skip_hint -= partial_frag_remainder; ++frag; } /* partial_frag_remainder: If part way through a frag, must read rest. * mappable_offset: Bytes till next mappable frag, *not* counting bytes * in partial_frag_remainder. */ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); zc->recv_skip_hint = mappable_offset + partial_frag_remainder; } static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; int err; zc->length = 0; zc->recv_skip_hint = 0; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, &msg.msg_iter); if (err) return err; err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT, tss, &zc->msg_flags); if (err < 0) return err; zc->copybuf_len = err; if (likely(zc->copybuf_len)) { struct sk_buff *skb; u32 offset; skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); if (skb) tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); } return 0; } static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; int err; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); if (err) return err; zc->recv_skip_hint -= copylen; *offset += copylen; *seq += copylen; return (__s32)copylen; } static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, struct sock *sk, struct sk_buff *skb, u32 *seq, s32 copybuf_len, struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, *seq, &offset); if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); return zc->copybuf_len < 0 ? 0 : copylen; } static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map, int err) { /* At least one page did not map. Try zapping if we skipped earlier. */ if (err == -EBUSY && zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { u32 maybe_zap_len; maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ zap_vma_range(vma, *address, maybe_zap_len); err = 0; } if (!err) { unsigned long leftover_pages = pages_remaining; int bytes_mapped; /* We called zap_vma_range, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); *seq += bytes_mapped; *address += bytes_mapped; } if (err) { /* Either we were unable to zap, OR we zapped, retried an * insert, and still had an issue. Either ways, pages_remaining * is the number of pages we were unable to map, and we unroll * some state we speculatively touched before. */ const int bytes_not_mapped = PAGE_SIZE * pages_remaining; *length -= bytes_not_mapped; zc->recv_skip_hint += bytes_not_mapped; } return err; } static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map) { unsigned long pages_remaining = pages_to_map; unsigned int pages_mapped; unsigned int bytes_mapped; int err; err = vm_insert_pages(vma, *address, pages, &pages_remaining); pages_mapped = pages_to_map - (unsigned int)pages_remaining; bytes_mapped = PAGE_SIZE * pages_mapped; /* Even if vm_insert_pages fails, it may have partially succeeded in * mapping (some but not all of the pages). */ *seq += bytes_mapped; *address += bytes_mapped; if (likely(!err)) return 0; /* Error: maybe zap and retry + rollback state for failed inserts. */ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, pages_remaining, address, length, seq, zc, total_bytes_to_map, err); } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { unsigned long msg_control_addr; struct msghdr cmsg_dummy; msg_control_addr = (unsigned long)zc->msg_control; cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; cmsg_dummy.msg_control_is_user = true; zc->msg_flags = 0; if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss); zc->msg_control = (__u64) ((uintptr_t)cmsg_dummy.msg_control_user); zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags; } } static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, unsigned long address, bool *mmap_locked) { struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); if (vma) { if (vma->vm_ops != &tcp_vm_ops) { vma_end_read(vma); return NULL; } *mmap_locked = false; return vma; } mmap_read_lock(mm); vma = vma_lookup(mm, address); if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(mm); return NULL; } *mmap_locked = true; return vma; } #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; unsigned int pages_to_map = 0; struct vm_area_struct *vma; struct sk_buff *skb = NULL; u32 seq = tp->copied_seq; u32 total_bytes_to_map; int inq = tcp_inq(sk); bool mmap_locked; int ret; zc->copybuf_len = 0; zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq; if (!inq && sock_flag(sk, SOCK_DONE)) return -EIO; return 0; } vma = find_tcp_vma(current->mm, address, &mmap_locked); if (!vma) return -EINVAL; vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) zap_vma_range(vma, address, total_bytes_to_map); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { zc->length = avail_len; zc->recv_skip_hint = avail_len; } ret = 0; while (length + PAGE_SIZE <= zc->length) { int mappable_offset; struct page *page; if (zc->recv_skip_hint < PAGE_SIZE) { u32 offset_frag; if (skb) { if (zc->recv_skip_hint > 0) break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, seq, &offset); } if (!skb_frags_readable(skb)) break; if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) break; } mappable_offset = find_next_mappable_frag(frags, zc->recv_skip_hint); if (mappable_offset) { zc->recv_skip_hint = mappable_offset; break; } page = skb_frag_page(frags); if (WARN_ON_ONCE(!page)) break; prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || zc->recv_skip_hint < PAGE_SIZE) { /* Either full batch, or we're about to go to next skb * (and we cannot unroll failed ops across skbs). */ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); if (ret) goto out; pages_to_map = 0; } } if (pages_to_map) { ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); } out: if (mmap_locked) mmap_read_unlock(current->mm); else vma_end_read(vma); /* Try to copy straggler data. */ if (!ret) copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, length + copylen); ret = 0; if (length == zc->length) zc->recv_skip_hint = 0; } else { if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) ret = -EIO; } zc->length = length; return ret; } #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); u32 tsflags = READ_ONCE(sk->sk_tsflags); if (tss->ts[0]) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { struct timespec64 tv = ktime_to_timespec64(tss->ts[0]); if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_timespec kts = { .tv_sec = tv.tv_sec, .tv_nsec = tv.tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { struct __kernel_old_timespec ts_old = { .tv_sec = tv.tv_sec, .tv_nsec = tv.tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { struct __kernel_sock_timeval stv = { .tv_sec = tv.tv_sec, .tv_usec = tv.tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { struct __kernel_old_timeval otv = { .tv_sec = tv.tv_sec, .tv_usec = tv.tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(otv), &otv); } } } if (!(tsflags & SOF_TIMESTAMPING_SOFTWARE && (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))) tss->ts[0] = 0; } if (tss->ts[2]) { if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))) tss->ts[2] = 0; } if (tss->ts[0] | tss->ts[2]) { tss->ts[1] = 0; if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, tss); else put_cmsg_scm_timestamping(msg, tss); } } static int tcp_inq_hint(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u32 copied_seq = READ_ONCE(tp->copied_seq); u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); int inq; inq = rcv_nxt - copied_seq; if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { lock_sock(sk); inq = tp->rcv_nxt - tp->copied_seq; release_sock(sk); } /* After receiving a FIN, tell the user-space to continue reading * by returning a non-zero inq. */ if (inq == 0 && sock_flag(sk, SOCK_DONE)) inq = 1; return inq; } /* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */ struct tcp_xa_pool { u8 max; /* max <= MAX_SKB_FRAGS */ u8 idx; /* idx <= max */ __u32 tokens[MAX_SKB_FRAGS]; netmem_ref netmems[MAX_SKB_FRAGS]; }; static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p) { int i; /* Commit part that has been copied to user space. */ for (i = 0; i < p->idx; i++) __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY, (__force void *)p->netmems[i], GFP_KERNEL); /* Rollback what has been pre-allocated and is no longer needed. */ for (; i < p->max; i++) __xa_erase(&sk->sk_user_frags, p->tokens[i]); p->max = 0; p->idx = 0; } static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p) { if (!p->max) return; xa_lock_bh(&sk->sk_user_frags); tcp_xa_pool_commit_locked(sk, p); xa_unlock_bh(&sk->sk_user_frags); } static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p, unsigned int max_frags) { int err, k; if (p->idx < p->max) return 0; xa_lock_bh(&sk->sk_user_frags); tcp_xa_pool_commit_locked(sk, p); for (k = 0; k < max_frags; k++) { err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k], XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL); if (err) break; } xa_unlock_bh(&sk->sk_user_frags); p->max = k; p->idx = 0; return k ? 0 : err; } /* On error, returns the -errno. On success, returns number of bytes sent to the * user. May not consume all of @remaining_len. */ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, unsigned int offset, struct msghdr *msg, int remaining_len) { struct dmabuf_cmsg dmabuf_cmsg = { 0 }; struct tcp_xa_pool tcp_xa_pool; unsigned int start; int i, copy, n; int sent = 0; int err = 0; tcp_xa_pool.max = 0; tcp_xa_pool.idx = 0; do { start = skb_headlen(skb); if (skb_frags_readable(skb)) { err = -ENODEV; goto out; } /* Copy header. */ copy = start - offset; if (copy > 0) { copy = min(copy, remaining_len); n = copy_to_iter(skb->data + offset, copy, &msg->msg_iter); if (n != copy) { err = -EFAULT; goto out; } offset += copy; remaining_len -= copy; /* First a dmabuf_cmsg for # bytes copied to user * buffer. */ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); dmabuf_cmsg.frag_size = copy; err = put_cmsg_notrunc(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, sizeof(dmabuf_cmsg), &dmabuf_cmsg); if (err) goto out; sent += copy; if (remaining_len == 0) goto out; } /* after that, send information of dmabuf pages through a * sequence of cmsg */ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct net_iov *niov; u64 frag_offset; int end; /* !skb_frags_readable() should indicate that ALL the * frags in this skb are dmabuf net_iovs. We're checking * for that flag above, but also check individual frags * here. If the tcp stack is not setting * skb_frags_readable() correctly, we still don't want * to crash here. */ if (!skb_frag_net_iov(frag)) { net_err_ratelimited("Found non-dmabuf skb with net_iov"); err = -ENODEV; goto out; } niov = skb_frag_net_iov(frag); if (!net_is_devmem_iov(niov)) { err = -ENODEV; goto out; } end = start + skb_frag_size(frag); copy = end - offset; if (copy > 0) { copy = min(copy, remaining_len); frag_offset = net_iov_virtual_addr(niov) + skb_frag_off(frag) + offset - start; dmabuf_cmsg.frag_offset = frag_offset; dmabuf_cmsg.frag_size = copy; err = tcp_xa_pool_refill(sk, &tcp_xa_pool, skb_shinfo(skb)->nr_frags - i); if (err) goto out; /* Will perform the exchange later */ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov); offset += copy; remaining_len -= copy; err = put_cmsg_notrunc(msg, SOL_SOCKET, SO_DEVMEM_DMABUF, sizeof(dmabuf_cmsg), &dmabuf_cmsg); if (err) goto out; atomic_long_inc(&niov->desc.pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); sent += copy; if (remaining_len == 0) goto out; } start = end; } tcp_xa_pool_commit(sk, &tcp_xa_pool); if (!remaining_len) goto out; /* if remaining_len is not satisfied yet, we need to go to the * next frag in the frag_list to satisfy remaining_len. */ skb = skb_shinfo(skb)->frag_list ?: skb->next; offset = offset - start; } while (skb); if (remaining_len) { err = -EFAULT; goto out; } out: tcp_xa_pool_commit(sk, &tcp_xa_pool); if (!sent) sent = err; return sent; } /* * This routine copies from a sock struct into the user buffer. * * Technical note: in 2.3 we work on _locked_ socket, so that * tricks with *seq access order and skb->users are not required. * Probably, code can be easily improved even more. */ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); int last_copied_dmabuf = -1; /* uninitialized */ int copied = 0; u32 peek_seq; u32 *seq; unsigned long used; int err; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 peek_offset = 0; u32 urg_hole = 0; err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; if (tp->recvmsg_inq) *cmsg_flags = TCP_CMSG_INQ; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Urgent data needs to be handled specially. */ if (flags & MSG_OOB) goto recv_urg; if (unlikely(tp->repair)) { err = -EPERM; if (!(flags & MSG_PEEK)) goto out; if (tp->repair_queue == TCP_SEND_QUEUE) goto recv_sndq; err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out; /* 'common' recv queue MSG_PEEK-ing */ } seq = &tp->copied_seq; if (flags & MSG_PEEK) { peek_offset = max(sk_peek_offset(sk, flags), 0); peek_seq = tp->copied_seq + peek_offset; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); do { u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } } /* Next get a buffer. */ last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { last = skb; /* Now that we have two receive queues this * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; offset = *seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (!timeo || sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); err = sk_wait_data(sk, &timeo, last); if (err < 0) { err = copied ? : err; goto out; } } if ((flags & MSG_PEEK) && (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = tp->copied_seq + peek_offset; } continue; found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; /* Do we have urgent data here? */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; if (!used) goto skip_copy; } } else used = urg_offset; } } if (!(flags & MSG_TRUNC)) { if (last_copied_dmabuf != -1 && last_copied_dmabuf != !skb_frags_readable(skb)) break; if (skb_frags_readable(skb)) { err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } } else { if (!(flags & MSG_SOCK_DEVMEM)) { /* dmabuf skbs can only be received * with the MSG_SOCK_DEVMEM flag. */ if (!copied) copied = -EFAULT; break; } err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, used); if (err < 0) { if (!copied) copied = err; break; } used = err; } } last_copied_dmabuf = !skb_frags_readable(skb); WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; if (flags & MSG_PEEK) sk_peek_offset_fwd(sk, used); else sk_peek_offset_bwd(sk, used); tcp_rcv_space_adjust(sk); skip_copy: if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) { WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= TCP_CMSG_TS; } if (used + offset < skb->len) continue; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); break; } while (len > 0); /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); return copied; out: return err; recv_urg: err = tcp_recv_urg(sk, msg, len, flags); goto out; recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { int cmsg_flags = 0, ret; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) sk_busy_loop(sk, flags & MSG_DONTWAIT); lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); if ((cmsg_flags | msg->msg_get_inq) && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if ((cmsg_flags & TCP_CMSG_INQ) | msg->msg_get_inq) { msg->msg_inq = tcp_inq_hint(sk); if (cmsg_flags & TCP_CMSG_INQ) put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(msg->msg_inq), &msg->msg_inq); } } return ret; } EXPORT_IPV6_MOD(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; /* We defined a new enum for TCP states that are exported in BPF * so as not force the internal TCP states to be frozen. The * following checks will detect if an internal state value ever * differs from the BPF value. If this ever happens, then we will * need to remap the internal value to the BPF value before calling * tcp_call_bpf_2arg. */ BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); /* bpf uapi header bpf.h defines an anonymous enum with values * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. * But clang built vmlinux does not have this enum in DWARF * since clang removes the above code before generating IR/debuginfo. * Let us explicitly emit the type debuginfo to ensure the * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF * regardless of which compiler is used. */ BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE_WAIT: if (oldstate == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_state_store(sk, state); } EXPORT_SYMBOL_GPL(tcp_set_state); /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be * closed. */ static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; tcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } /* * Shutdown the sending side of a connection. Much like close except * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). */ void tcp_shutdown(struct sock *sk, int how) { /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. */ if (!(how & SEND_SHUTDOWN)) return; /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. */ if (tcp_close_state(sk)) tcp_send_fin(sk); } } EXPORT_IPV6_MOD(tcp_shutdown); int tcp_orphan_count_sum(void) { int i, total = 0; for_each_possible_cpu(i) total += per_cpu(tcp_orphan_count, i); return max(total, 0); } static int tcp_orphan_cache; static struct timer_list tcp_orphan_timer; #define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) static void tcp_orphan_update(struct timer_list *unused) { WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); } static bool tcp_too_many_orphans(int shift) { return READ_ONCE(tcp_orphan_cache) << shift > READ_ONCE(sysctl_tcp_max_orphans); } static bool tcp_out_of_memory(const struct sock *sk) { if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) return true; return false; } bool tcp_check_oom(const struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) net_info_ratelimited("too many orphaned sockets\n"); if (out_of_socket_memory) net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); return too_many_orphans || out_of_socket_memory; } void __tcp_close(struct sock *sk, long timeout) { bool data_was_unread = false; struct sk_buff *skb; int state; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) end_seq--; if (after(end_seq, tcp_sk(sk)->copied_seq)) data_was_unread = true; tcp_eat_recv_skb(sk, skb); } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk * GET in an FTP client, suspend the process, wait for the client to * advertise a zero window, then kill -9 the FTP client, wheee... * Note: timeout is always zero in such a case. */ if (unlikely(tcp_sk(sk)->repair)) { sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation, SK_RST_REASON_TCP_ABORT_ON_CLOSE); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. */ /* RED-PEN. Formally speaking, we have broken TCP state * machine. State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), * rather than queued out of window. Purists blame. * * F.e. "RFC state" is ESTABLISHED, * if Linux state is FIN-WAIT-1, but FIN is still not sent. * * The visible declinations are that sometimes * we enter time-wait state, when it is not required really * (harmless), do not send active resets, when they are * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when * they look as CLOSING or LAST_ACK for Linux) * Probably, I missed some more holelets. * --ANK * XXX (TFO) - To start off we don't support SYN+ACK+FIN * in a single packet! (May consider it later but will * probably need API support or TCP_CORK SYN-ACK until * data is written and socket is closed.) */ tcp_send_fin(sk); } sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); local_bh_disable(); bh_lock_sock(sk); /* remove backlog if any, without releasing ownership. */ __release_sock(sk); tcp_orphan_count_inc(); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) goto out; /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. * * Nope, it was not mistake. It is really desired behaviour * f.e. on http servers, when such sockets are useless, but * consume significant resources. Let's do it with special * linger2 option. --ANK */ if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_ABORT_ON_LINGER); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } } if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_ABORT_ON_MEMORY); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { /* Not possible to send reset; just close */ tcp_set_state(sk, TCP_CLOSE); } } if (sk->sk_state == TCP_CLOSE) { struct request_sock *req; req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. */ if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } /* Otherwise, socket is reprieved until protocol close. */ out: bh_unlock_sock(sk); local_bh_enable(); } void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ static inline bool tcp_need_reset(int state) { return (1 << state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); /* Since we are deleting whole queue, no need to * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } } void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); tcp_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { WRITE_ONCE(sk->sk_err, ECONNABORTED); } else if (tcp_need_reset(old_state)) { tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) { /* The last check adjusts for discrepancy of Linux wrt. RFC * states */ tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_DISCONNECT_WITH_DATA); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); WRITE_ONCE(tp->urg_data, 0); sk_set_peek_off(sk, -1); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; seq = tp->write_seq + tp->max_window + 2; if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX); tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; tp->is_cwnd_limited = 0; tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; tp->accecn_fail_mode = 0; tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; tp->pkts_acked_ewma = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); tp->total_retrans = 0; inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL))); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; tp->segs_out = 0; tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; tp->bytes_retrans = 0; tp->data_segs_in = 0; tp->data_segs_out = 0; tp->duplicate_sack[0].start_seq = 0; tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; tp->retrans_out = 0; tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; tp->rack.mstamp = 0; tp->rack.advanced = 0; tp->rack.reo_wnd_steps = 1; tp->rack.last_delivered = 0; tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; tp->syn_fastopen_child = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ req = rcu_dereference_protected(tp->fastopen_rsk, lockdep_sock_is_held(sk)); if (req) reqsk_fastopen_remove(sk, req, false); tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; if (!tp->repair) return -EPERM; if (len != sizeof(opt)) return -EINVAL; if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) return -EINVAL; if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) return -EINVAL; if (after(opt.rcv_wup, tp->rcv_nxt)) return -EINVAL; tp->snd_wl1 = opt.snd_wl1; tp->snd_wnd = opt.snd_wnd; tp->max_window = opt.max_window; tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; return 0; } static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; size_t offset = 0; while (len >= sizeof(opt)) { if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; tp->rx_opt.rcv_wscale = rcv_wscale; tp->rx_opt.wscale_ok = 1; } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.tstamp_ok = 1; break; } } return 0; } DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); EXPORT_IPV6_MOD(tcp_tx_delay_enabled); static void tcp_enable_tx_delay(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); s32 delta = (val - tp->tcp_tx_delay) << 3; if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) { static int __tcp_tx_delay_enabled = 0; if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { static_branch_enable(&tcp_tx_delay_enabled); pr_info("TCP_TX_DELAY enabled\n"); } } /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us, * tp->rtt_min, icsk_rto and sk->sk_pacing_rate. * This is best effort. */ if (delta && sk->sk_state == TCP_ESTABLISHED) { s64 srtt = (s64)tp->srtt_us + delta; tp->srtt_us = clamp_t(s64, srtt, 1, ~0U); /* Note: does not deal with non zero icsk_backoff */ tcp_set_rto(sk); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); tcp_update_pacing_rate(sk); } } /* When set indicates to always queue non-full frames. Later the user clears * this option and we transmit any pending partial frames in the queue. This is * meant to be used alongside sendfile() to get properly filled frames when the * user (for example) must write out headers with a write() call first and then * use sendfile to send out the data parts. * * TCP_CORK can be set together with TCP_NODELAY and it is stronger than * TCP_NODELAY. */ void __tcp_sock_set_cork(struct sock *sk, bool on) { struct tcp_sock *tp = tcp_sk(sk); if (on) { tp->nonagle |= TCP_NAGLE_CORK; } else { tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle & TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } } void tcp_sock_set_cork(struct sock *sk, bool on) { lock_sock(sk); __tcp_sock_set_cork(sk, on); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_cork); /* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is * remembered, but it is not activated until cork is cleared. * * However, when TCP_NODELAY is set we make an explicit push, which overrides * even TCP_CORK for currently queued segments. */ void __tcp_sock_set_nodelay(struct sock *sk, bool on) { if (on) { tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; tcp_push_pending_frames(sk); } else { tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; } } void tcp_sock_set_nodelay(struct sock *sk) { lock_sock(sk); __tcp_sock_set_nodelay(sk, true); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_nodelay); static void __tcp_sock_set_quickack(struct sock *sk, int val) { if (!val) { inet_csk_enter_pingpong_mode(sk); return; } inet_csk_exit_pingpong_mode(sk); if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) inet_csk_enter_pingpong_mode(sk); } } void tcp_sock_set_quickack(struct sock *sk, int val) { lock_sock(sk); __tcp_sock_set_quickack(sk, val); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_quickack); int tcp_sock_set_syncnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_SYNCNT) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_syncnt); int tcp_sock_set_user_timeout(struct sock *sk, int val) { /* Cap the max time in ms TCP will retry or probe the window * before giving up and aborting (ETIMEDOUT) a connection. */ if (val < 0) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; /* Paired with WRITE_ONCE() in keepalive_time_when() */ WRITE_ONCE(tp->keepalive_time, val * HZ); if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; tcp_reset_keepalive_timer(sk, elapsed); } return 0; } int tcp_sock_set_keepidle(struct sock *sk, int val) { int err; lock_sock(sk); err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } EXPORT_SYMBOL(tcp_sock_set_keepidle); int tcp_sock_set_keepintvl(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); int tcp_sock_set_keepcnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh; struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; WRITE_ONCE(tp->window_clamp, 0); return 0; } old_window_clamp = tp->window_clamp; new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val); if (new_window_clamp == old_window_clamp) return 0; WRITE_ONCE(tp->window_clamp, new_window_clamp); /* Need to apply the reserved mem provisioning only * when shrinking the window clamp. */ if (new_window_clamp < old_window_clamp) { __tcp_adjust_rcv_ssthresh(sk, new_window_clamp); } else { new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp); tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh); } return 0; } int tcp_sock_set_maxseg(struct sock *sk, int val) { /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) return -EINVAL; WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val); return 0; } /* * Socket option code for TCP. */ int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int val; int err = 0; /* These are data/string values, all the others are ints */ switch (optname) { case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); sockopt_release_sock(sk); return err; } case TCP_ULP: { char name[TCP_ULP_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_ulp(sk, name); sockopt_release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; __u8 *backup_key = NULL; /* Allow a backup key as well to facilitate key rotation * First key is the active one. */ if (optlen != TCP_FASTOPEN_KEY_LENGTH && optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) backup_key = key + TCP_FASTOPEN_KEY_LENGTH; return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ break; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; /* Handle options that can be set without locking the socket. */ switch (optname) { case TCP_SYNCNT: return tcp_sock_set_syncnt(sk, val); case TCP_USER_TIMEOUT: return tcp_sock_set_user_timeout(sk, val); case TCP_KEEPINTVL: return tcp_sock_set_keepintvl(sk, val); case TCP_KEEPCNT: return tcp_sock_set_keepcnt(sk, val); case TCP_LINGER2: if (val < 0) WRITE_ONCE(tp->linger2, -1); else if (val > TCP_FIN_TIMEOUT_MAX / HZ) WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); else WRITE_ONCE(tp->linger2, val * HZ); return 0; case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ)); return 0; case TCP_RTO_MAX_MS: if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val)); return 0; case TCP_RTO_MIN_US: { int rto_min = usecs_to_jiffies(val); if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min); return 0; } case TCP_DELACK_MAX_US: { int delack_max = usecs_to_jiffies(val); if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max); return 0; } case TCP_MAXSEG: return tcp_sock_set_maxseg(sk, val); } sockopt_lock_sock(sk); switch (optname) { case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: if (val < 0 || val > 1) err = -EINVAL; else tp->thin_lto = val; break; case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; break; case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); } else if (val == TCP_REPAIR_OFF_NO_WP) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; break; case TCP_REPAIR_QUEUE: if (!tp->repair) err = -EPERM; else if ((unsigned int)val < TCP_QUEUES_NR) tp->repair_queue = val; else err = -EINVAL; break; case TCP_QUEUE_SEQ: if (sk->sk_state != TCP_CLOSE) { err = -EPERM; } else if (tp->repair_queue == TCP_SEND_QUEUE) { if (!tcp_rtx_queue_empty(sk)) err = -EPERM; else WRITE_ONCE(tp->write_seq, val); } else if (tp->repair_queue == TCP_RECV_QUEUE) { if (tp->rcv_nxt != tp->copied_seq) { err = -EPERM; } else { WRITE_ONCE(tp->rcv_nxt, val); WRITE_ONCE(tp->copied_seq, val); } } else { err = -EINVAL; } break; case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; case TCP_CORK: __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; break; case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: __tcp_sock_set_quickack(sk, val); break; case TCP_AO_REPAIR: if (!tcp_can_repair_sock(sk)) { err = -EPERM; break; } err = tcp_ao_set_repair(sk, optval, optlen); break; #ifdef CONFIG_TCP_AO case TCP_AO_ADD_KEY: case TCP_AO_DEL_KEY: case TCP_AO_INFO: { /* If this is the first TCP-AO setsockopt() on the socket, * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR * in any state. */ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) goto ao_parse; if (rcu_dereference_protected(tcp_sk(sk)->ao_info, lockdep_sock_is_held(sk))) goto ao_parse; if (tp->repair) goto ao_parse; err = -EISCONN; break; ao_parse: err = tp->af_specific->ao_parse(sk, optname, optval, optlen); break; } #endif #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { err = -EINVAL; } break; case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else err = -EINVAL; } else { err = -EOPNOTSUPP; } break; case TCP_FASTOPEN_NO_COOKIE: if (val > 1 || val < 0) err = -EINVAL; else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) err = -EINVAL; else tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: if (!tp->repair) { err = -EPERM; break; } /* val is an opaque field, * and low order bit contains usec_ts enable bit. * Its a best effort, and we do not care if user makes an error. */ tp->tcp_usec_ts = val & 1; WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts)); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(tp->notsent_lowat, val); READ_ONCE(sk->sk_write_space)(sk); break; case TCP_INQ: if (val > 1 || val < 0) err = -EINVAL; else tp->recvmsg_inq = val; break; case TCP_TX_DELAY: /* tp->srtt_us is u32, and is shifted by 3 */ if (val < 0 || val >= (1U << (31 - 3))) { err = -EINVAL; break; } tcp_enable_tx_delay(sk, val); WRITE_ONCE(tp->tcp_tx_delay, val); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return err; } int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_IPV6_MOD(tcp_setsockopt); static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { u64 stats[__TCP_CHRONO_MAX], total = 0; enum tcp_chrono i; for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } info->tcpi_busy_time = total; info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; } /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); const u8 ect1_idx = INET_ECN_ECT_1 - 1; const u8 ect0_idx = INET_ECN_ECT_0 - 1; const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; bool slow; memset(info, 0, sizeof(*info)); if (sk->sk_type != SOCK_STREAM) return; info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_pacing_rate = rate64; rate = READ_ONCE(sk->sk_max_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); if (info->tcpi_state == TCP_LISTEN) { /* listeners aliased fields : * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } slow = lock_sock_fast(sk); info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } if (tcp_ecn_mode_any(tp)) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; if (tp->tcp_usec_ts) info->tcpi_options |= TCPI_OPT_USEC_TS; if (tp->syn_fastopen_child) info->tcpi_options |= TCPI_OPT_TFO_CHILD; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, tcp_delack_max(sk))); info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); tcp_get_info_chrono_stats(tp, info); info->tcpi_segs_out = tp->segs_out; /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */ info->tcpi_segs_in = READ_ONCE(tp->segs_in); info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in); info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0; rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; info->tcpi_rcv_wnd = tp->rcv_wnd; info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; info->tcpi_total_rto = tp->total_rto; info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; info->tcpi_total_rto_time = tp->total_rto_time; if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; if (tcp_ecn_disabled(tp)) info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED; else if (tcp_ecn_mode_rfc3168(tp)) info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168; else if (tcp_ecn_mode_accecn(tp)) info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN; else if (tcp_ecn_mode_pending(tp)) info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING; info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); static size_t tcp_opt_stats_get_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } /* Returns TTL or hop limit of an incoming packet from skb. */ static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return ip_hdr(skb)->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) return ipv6_hdr(skb)->hop_limit; else return 0; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; unsigned long rate; u64 rate64; stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; tcp_get_info_chrono_stats(tp, &info); nla_put_u64_64bit(stats, TCP_NLA_BUSY, info.tcpi_busy_time, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, info.tcpi_rwnd_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, info.tcpi_sndbuf_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); rate64 = tcp_compute_delivery_rate(tp); nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, READ_ONCE(inet_csk(sk)->icsk_retransmits)); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); if (ack_skb) nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int user_mss; int val, len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; user_mss = READ_ONCE(tp->rx_opt.user_mss); if (user_mss && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; case TCP_NODELAY: val = !!(tp->nonagle&TCP_NAGLE_OFF); break; case TCP_CORK: val = !!(tp->nonagle&TCP_NAGLE_CORK); break; case TCP_KEEPIDLE: val = keepalive_time_when(tp) / HZ; break; case TCP_KEEPINTVL: val = keepalive_intvl_when(tp) / HZ; break; case TCP_KEEPCNT: val = keepalive_probes(tp); break; case TCP_SYNCNT: val = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = READ_ONCE(tp->linger2); if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = READ_ONCE(tp->window_clamp); break; case TCP_INFO: { struct tcp_info info; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_CC_INFO: { const struct tcp_congestion_ops *ca_ops; union tcp_cc_info info; size_t sz = 0; int attr; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; ca_ops = icsk->icsk_ca_ops; if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_QUICKACK: val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; case TCP_FASTOPEN_KEY: { u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, key, len)) return -EFAULT; return 0; } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; case TCP_THIN_DUPACK: val = 0; break; case TCP_REPAIR: val = tp->repair; break; case TCP_REPAIR_QUEUE: if (tp->repair) val = tp->repair_queue; else return -EINVAL; break; case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len != sizeof(opt)) return -EINVAL; if (!tp->repair) return -EPERM; opt.snd_wl1 = tp->snd_wl1; opt.snd_wnd = tp->snd_wnd; opt.max_window = tp->max_window; opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; if (copy_to_sockptr(optval, &opt, len)) return -EFAULT; return 0; } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; else if (tp->repair_queue == TCP_RECV_QUEUE) val = tp->rcv_nxt; else return -EINVAL; break; case TCP_USER_TIMEOUT: val = READ_ONCE(icsk->icsk_user_timeout); break; case TCP_FASTOPEN: val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: val = tp->fastopen_connect; break; case TCP_FASTOPEN_NO_COOKIE: val = tp->fastopen_no_cookie; break; case TCP_TX_DELAY: val = READ_ONCE(tp->tcp_tx_delay); break; case TCP_TIMESTAMP: val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset); if (tp->tcp_usec_ts) val |= 1; else val &= ~1; break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; break; case TCP_SAVE_SYN: val = tp->save_syn; break; case TCP_SAVED_SYN: { if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; sockopt_lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } sockopt_release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { sockopt_release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); sockopt_release_sock(sk); } else { sockopt_release_sock(sk); len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } return 0; } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { err = check_zeroed_sockptr(optval, sizeof(zc), len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } if (copy_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; sockopt_lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); sockopt_release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { case offsetofend(struct tcp_zerocopy_receive, msg_flags): goto zerocopy_rcv_cmsg; case offsetofend(struct tcp_zerocopy_receive, msg_controllen): case offsetofend(struct tcp_zerocopy_receive, msg_control): case offsetofend(struct tcp_zerocopy_receive, flags): case offsetofend(struct tcp_zerocopy_receive, copybuf_len): case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): goto zerocopy_rcv_inq; case offsetofend(struct tcp_zerocopy_receive, length): default: goto zerocopy_rcv_out; } zerocopy_rcv_cmsg: if (zc.msg_flags & TCP_CMSG_TS) tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); else zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: if (!err && copy_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } #endif case TCP_AO_REPAIR: if (!tcp_can_repair_sock(sk)) return -EPERM; return tcp_ao_get_repair(sk, optval, optlen); case TCP_AO_GET_KEYS: case TCP_AO_INFO: { int err; sockopt_lock_sock(sk); if (optname == TCP_AO_GET_KEYS) err = tcp_ao_get_mkts(sk, optval, optlen); else err = tcp_ao_get_sock_info(sk, optval, optlen); sockopt_release_sock(sk); return err; } case TCP_IS_MPTCP: val = 0; break; case TCP_RTO_MAX_MS: val = jiffies_to_msecs(tcp_rto_max(sk)); break; case TCP_RTO_MIN_US: val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min)); break; case TCP_DELACK_MAX_US: val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max)); break; default: return -ENOPROTOOPT; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } bool tcp_bpf_bypass_getsockopt(int level, int optname) { /* TCP do_tcp_getsockopt has optimized getsockopt implementation * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. */ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) return true; return false; } EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, optval, optlen); return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } EXPORT_IPV6_MOD(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, unsigned int header_len) { const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); struct sk_buff *frag_iter; unsigned int i; md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len); for (i = 0; i < shi->nr_frags; ++i) { const skb_frag_t *f = &shi->frags[i]; u32 p_off, p_len, copied; const void *vaddr; struct page *p; skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), p, p_off, p_len, copied) { vaddr = kmap_local_page(p); md5_update(ctx, vaddr + p_off, p_len); kunmap_local(vaddr); } } skb_walk_frags(skb, frag_iter) tcp_md5_hash_skb_data(ctx, frag_iter, 0); } EXPORT_IPV6_MOD(tcp_md5_hash_skb_data); void tcp_md5_hash_key(struct md5_ctx *ctx, const struct tcp_md5sig_key *key) { u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ /* We use data_race() because tcp_md5_do_add() might change * key->key under us */ data_race(({ md5_update(ctx, key->key, keylen), 0; })); } EXPORT_IPV6_MOD(tcp_md5_hash_key); /* Called with rcu_read_lock() */ static enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int l3index, const __u8 *hash_location) { /* This gets called for each TCP segment that has TCP-MD5 option. * We have 2 drop cases: * o An MD5 signature is present, but we're not expecting one. * o The MD5 signature is wrong. */ const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; u8 newhash[16]; key = tcp_md5_do_lookup(sk, l3index, saddr, family); if (!key) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); trace_tcp_hash_md5_unexpected(sk, skb); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } /* Check the signature. * To support dual stack listeners, we need to handle * IPv4-mapped case. */ if (family == AF_INET) tcp_v4_md5_hash_skb(newhash, key, NULL, skb); else tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); if (crypto_memneq(hash_location, newhash, 16)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; } return SKB_NOT_DROPPED_YET; } #else static inline enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int l3index, const __u8 *hash_location) { return SKB_NOT_DROPPED_YET; } #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) /* * Parse Signature options */ int tcp_do_parse_auth_options(const struct tcphdr *th, const u8 **md5_hash, const u8 **ao_hash) { int length = (th->doff << 2) - sizeof(*th); const u8 *ptr = (const u8 *)(th + 1); unsigned int minlen = TCPOLEN_MD5SIG; if (IS_ENABLED(CONFIG_TCP_AO)) minlen = sizeof(struct tcp_ao_hdr) + 1; *md5_hash = NULL; *ao_hash = NULL; /* If not enough data remaining, we can short cut */ while (length >= minlen) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return 0; case TCPOPT_NOP: length--; continue; default: opsize = *ptr++; if (opsize < 2 || opsize > length) return -EINVAL; if (opcode == TCPOPT_MD5SIG) { if (opsize != TCPOLEN_MD5SIG) return -EINVAL; if (unlikely(*md5_hash || *ao_hash)) return -EEXIST; *md5_hash = ptr; } else if (opcode == TCPOPT_AO) { if (opsize <= sizeof(struct tcp_ao_hdr)) return -EINVAL; if (unlikely(*md5_hash || *ao_hash)) return -EEXIST; *ao_hash = ptr; } } ptr += opsize - 2; length -= opsize; } return 0; } EXPORT_IPV6_MOD(tcp_do_parse_auth_options); #endif /* Called with rcu_read_lock() */ enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const struct request_sock *req, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int dif, int sdif) { const struct tcphdr *th = tcp_hdr(skb); const struct tcp_ao_hdr *aoh; const __u8 *md5_location; int l3index; /* Invalid option or two times meet any of auth options */ if (tcp_parse_auth_options(th, &md5_location, &aoh)) { trace_tcp_hash_bad_header(sk, skb); return SKB_DROP_REASON_TCP_AUTH_HDR; } if (req) { if (tcp_rsk_used_ao(req) != !!aoh) { u8 keyid, rnext, maclen; if (aoh) { keyid = aoh->keyid; rnext = aoh->rnext_keyid; maclen = tcp_ao_hdr_maclen(aoh); } else { keyid = rnext = maclen = 0; } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD); trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen); return SKB_DROP_REASON_TCP_AOFAILURE; } } /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to the l3mdev */ l3index = sdif ? dif : 0; /* Fast path: unsigned segments */ if (likely(!md5_location && !aoh)) { /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid * for the remote peer. On TCP-AO established connection * the last key is impossible to remove, so there's * always at least one current_key. */ if (tcp_ao_required(sk, saddr, family, l3index, true)) { trace_tcp_hash_ao_required(sk, skb); return SKB_DROP_REASON_TCP_AONOTFOUND; } if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); trace_tcp_hash_md5_required(sk, skb); return SKB_DROP_REASON_TCP_MD5NOTFOUND; } return SKB_NOT_DROPPED_YET; } if (aoh) return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh); return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, l3index, md5_location); } EXPORT_IPV6_MOD_GPL(tcp_inbound_hash); void tcp_done(struct sock *sk) { struct request_sock *req; /* We might be called with a new socket, after * inet_csk_prepare_forced_close() has been called * so we can not use lockdep_sock_is_held(sk) */ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); if (req) reqsk_fastopen_remove(sk, req, false); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { int state = inet_sk_state_load(sk); if (state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); local_bh_disable(); inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } if (state == TCP_TIME_WAIT) { struct inet_timewait_sock *tw = inet_twsk(sk); refcount_inc(&tw->tw_refcnt); local_bh_disable(); inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; } /* BPF context ensures sock locking. */ if (!has_current_bpf_ctx()) /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); /* Avoid closing the same socket twice. */ if (sk->sk_state == TCP_CLOSE) { if (!has_current_bpf_ctx()) release_sock(sk); return -ENOENT; } if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); } /* Don't race with BH socket closes such as inet_csk_listen_stop. */ local_bh_disable(); bh_lock_sock(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_STATE); tcp_done_with_error(sk, err); bh_unlock_sock(sk); local_bh_enable(); if (!has_current_bpf_ctx()) release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &thash_entries); if (ret) return 0; return 1; } __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 16; limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ sysctl_tcp_mem[1] = limit; /* 6.25 % */ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } static void __init tcp_struct_check(void) { /* TX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); #if IS_ENABLED(CONFIG_TLS_DEVICE) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked); #endif /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); } void __init tcp_init(void) { int max_rshare, max_wshare, cnt; unsigned long limit; unsigned int i; BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb)); tcp_struct_check(); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); tcp_hashinfo.bind2_bucket_cachep = kmem_cache_create("tcp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, 17, /* one slot per 128 KB of memory */ 0, NULL, &tcp_hashinfo.ehash_mask, 0, thash_entries ? 0 : 512 * 1024); for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", 2 * sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, 17, /* one slot per 128 KB of memory */ 0, &tcp_hashinfo.bhash_size, NULL, 0, 64 * 1024); tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); spin_lock_init(&tcp_hashinfo.bhash2[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } tcp_hashinfo.pernet = false; cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); max_wshare = min(4UL*1024*1024, limit); max_rshare = min(32UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare); init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_rmem[1] = 131072; init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare); pr_info("Hash tables configured (established %u bind %u)\n", tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tsq_work_init(); mptcp_init(); }
2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 // SPDX-License-Identifier: GPL-2.0-only /* * scsi_sysfs.c * * SCSI sysfs interface routines. * * Created to pull SCSI mid layer sysfs routines into one file. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/device.h> #include <linux/pm_runtime.h> #include <linux/bsg.h> #include <scsi/scsi.h> #include <scsi/scsi_device.h> #include <scsi/scsi_host.h> #include <scsi/scsi_tcq.h> #include <scsi/scsi_dh.h> #include <scsi/scsi_transport.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_devinfo.h> #include "scsi_priv.h" #include "scsi_logging.h" static const struct device_type scsi_dev_type; static const struct { enum scsi_device_state value; char *name; } sdev_states[] = { { SDEV_CREATED, "created" }, { SDEV_RUNNING, "running" }, { SDEV_CANCEL, "cancel" }, { SDEV_DEL, "deleted" }, { SDEV_QUIESCE, "quiesce" }, { SDEV_OFFLINE, "offline" }, { SDEV_TRANSPORT_OFFLINE, "transport-offline" }, { SDEV_BLOCK, "blocked" }, { SDEV_CREATED_BLOCK, "created-blocked" }, }; const char *scsi_device_state_name(enum scsi_device_state state) { int i; char *name = NULL; for (i = 0; i < ARRAY_SIZE(sdev_states); i++) { if (sdev_states[i].value == state) { name = sdev_states[i].name; break; } } return name; } static const struct { enum scsi_host_state value; char *name; } shost_states[] = { { SHOST_CREATED, "created" }, { SHOST_RUNNING, "running" }, { SHOST_CANCEL, "cancel" }, { SHOST_DEL, "deleted" }, { SHOST_RECOVERY, "recovery" }, { SHOST_CANCEL_RECOVERY, "cancel/recovery" }, { SHOST_DEL_RECOVERY, "deleted/recovery", }, }; const char *scsi_host_state_name(enum scsi_host_state state) { int i; char *name = NULL; for (i = 0; i < ARRAY_SIZE(shost_states); i++) { if (shost_states[i].value == state) { name = shost_states[i].name; break; } } return name; } #ifdef CONFIG_SCSI_DH static const struct { unsigned char value; char *name; } sdev_access_states[] = { { SCSI_ACCESS_STATE_OPTIMAL, "active/optimized" }, { SCSI_ACCESS_STATE_ACTIVE, "active/non-optimized" }, { SCSI_ACCESS_STATE_STANDBY, "standby" }, { SCSI_ACCESS_STATE_UNAVAILABLE, "unavailable" }, { SCSI_ACCESS_STATE_LBA, "lba-dependent" }, { SCSI_ACCESS_STATE_OFFLINE, "offline" }, { SCSI_ACCESS_STATE_TRANSITIONING, "transitioning" }, }; static const char *scsi_access_state_name(unsigned char state) { int i; char *name = NULL; for (i = 0; i < ARRAY_SIZE(sdev_access_states); i++) { if (sdev_access_states[i].value == state) { name = sdev_access_states[i].name; break; } } return name; } #endif static int check_set(unsigned long long *val, char *src) { char *last; if (strcmp(src, "-") == 0) { *val = SCAN_WILD_CARD; } else { /* * Doesn't check for int overflow */ *val = simple_strtoull(src, &last, 0); if (*last != '\0') return 1; } return 0; } static int scsi_scan(struct Scsi_Host *shost, const char *str) { char s1[15], s2[15], s3[17], junk; unsigned long long channel, id, lun; int res; res = sscanf(str, "%10s %10s %16s %c", s1, s2, s3, &junk); if (res != 3) return -EINVAL; if (check_set(&channel, s1)) return -EINVAL; if (check_set(&id, s2)) return -EINVAL; if (check_set(&lun, s3)) return -EINVAL; if (shost->transportt->user_scan) res = shost->transportt->user_scan(shost, channel, id, lun); else res = scsi_scan_host_selected(shost, channel, id, lun, SCSI_SCAN_MANUAL); return res; } /* * shost_show_function: macro to create an attr function that can be used to * show a non-bit field. */ #define shost_show_function(name, field, format_string) \ static ssize_t \ show_##name (struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct Scsi_Host *shost = class_to_shost(dev); \ return snprintf (buf, 20, format_string, shost->field); \ } /* * shost_rd_attr: macro to create a function and attribute variable for a * read only field. */ #define shost_rd_attr2(name, field, format_string) \ shost_show_function(name, field, format_string) \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL); #define shost_rd_attr(field, format_string) \ shost_rd_attr2(field, field, format_string) /* * Create the actual show/store functions and data structures. */ static ssize_t store_scan(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct Scsi_Host *shost = class_to_shost(dev); int res; res = scsi_scan(shost, buf); if (res == 0) res = count; return res; }; static DEVICE_ATTR(scan, S_IWUSR, NULL, store_scan); static ssize_t store_shost_state(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i; struct Scsi_Host *shost = class_to_shost(dev); enum scsi_host_state state = 0; for (i = 0; i < ARRAY_SIZE(shost_states); i++) { const int len = strlen(shost_states[i].name); if (strncmp(shost_states[i].name, buf, len) == 0 && buf[len] == '\n') { state = shost_states[i].value; break; } } if (!state) return -EINVAL; if (scsi_host_set_state(shost, state)) return -EINVAL; return count; } static ssize_t show_shost_state(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); const char *name = scsi_host_state_name(shost->shost_state); if (!name) return -EINVAL; return snprintf(buf, 20, "%s\n", name); } /* DEVICE_ATTR(state) clashes with dev_attr_state for sdev */ static struct device_attribute dev_attr_hstate = __ATTR(state, S_IRUGO | S_IWUSR, show_shost_state, store_shost_state); static ssize_t show_shost_mode(unsigned int mode, char *buf) { ssize_t len = 0; if (mode & MODE_INITIATOR) len = sprintf(buf, "%s", "Initiator"); if (mode & MODE_TARGET) len += sprintf(buf + len, "%s%s", len ? ", " : "", "Target"); len += sprintf(buf + len, "\n"); return len; } static ssize_t show_shost_supported_mode(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); unsigned int supported_mode = shost->hostt->supported_mode; if (supported_mode == MODE_UNKNOWN) /* by default this should be initiator */ supported_mode = MODE_INITIATOR; return show_shost_mode(supported_mode, buf); } static DEVICE_ATTR(supported_mode, S_IRUGO, show_shost_supported_mode, NULL); static ssize_t show_shost_active_mode(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); if (shost->active_mode == MODE_UNKNOWN) return snprintf(buf, 20, "unknown\n"); else return show_shost_mode(shost->active_mode, buf); } static DEVICE_ATTR(active_mode, S_IRUGO, show_shost_active_mode, NULL); static int check_reset_type(const char *str) { if (sysfs_streq(str, "adapter")) return SCSI_ADAPTER_RESET; else if (sysfs_streq(str, "firmware")) return SCSI_FIRMWARE_RESET; else return 0; } static ssize_t store_host_reset(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct Scsi_Host *shost = class_to_shost(dev); const struct scsi_host_template *sht = shost->hostt; int ret = -EINVAL; int type; type = check_reset_type(buf); if (!type) goto exit_store_host_reset; if (sht->host_reset) ret = sht->host_reset(shost, type); else ret = -EOPNOTSUPP; exit_store_host_reset: if (ret == 0) ret = count; return ret; } static DEVICE_ATTR(host_reset, S_IWUSR, NULL, store_host_reset); static ssize_t show_shost_eh_deadline(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); if (shost->eh_deadline == -1) return snprintf(buf, strlen("off") + 2, "off\n"); return sprintf(buf, "%u\n", shost->eh_deadline / HZ); } static ssize_t store_shost_eh_deadline(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct Scsi_Host *shost = class_to_shost(dev); int ret = -EINVAL; unsigned long deadline, flags; if (shost->transportt && (shost->transportt->eh_strategy_handler || !shost->hostt->eh_host_reset_handler)) return ret; if (!strncmp(buf, "off", strlen("off"))) deadline = -1; else { ret = kstrtoul(buf, 10, &deadline); if (ret) return ret; if (deadline * HZ > UINT_MAX) return -EINVAL; } spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_in_recovery(shost)) ret = -EBUSY; else { if (deadline == -1) shost->eh_deadline = -1; else shost->eh_deadline = deadline * HZ; ret = count; } spin_unlock_irqrestore(shost->host_lock, flags); return ret; } static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline); shost_rd_attr(unique_id, "%u\n"); shost_rd_attr(cmd_per_lun, "%hd\n"); shost_rd_attr(can_queue, "%d\n"); shost_rd_attr(sg_tablesize, "%hu\n"); shost_rd_attr(sg_prot_tablesize, "%hu\n"); shost_rd_attr(prot_capabilities, "%u\n"); shost_rd_attr(prot_guard_type, "%hd\n"); shost_rd_attr2(proc_name, hostt->proc_name, "%s\n"); static ssize_t show_host_busy(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); return snprintf(buf, 20, "%d\n", scsi_host_busy(shost)); } static DEVICE_ATTR(host_busy, S_IRUGO, show_host_busy, NULL); static ssize_t show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "1\n"); } static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL); static ssize_t show_nr_hw_queues(struct device *dev, struct device_attribute *attr, char *buf) { struct Scsi_Host *shost = class_to_shost(dev); struct blk_mq_tag_set *tag_set = &shost->tag_set; return snprintf(buf, 20, "%d\n", tag_set->nr_hw_queues); } static DEVICE_ATTR(nr_hw_queues, S_IRUGO, show_nr_hw_queues, NULL); static struct attribute *scsi_sysfs_shost_attrs[] = { &dev_attr_use_blk_mq.attr, &dev_attr_unique_id.attr, &dev_attr_host_busy.attr, &dev_attr_cmd_per_lun.attr, &dev_attr_can_queue.attr, &dev_attr_sg_tablesize.attr, &dev_attr_sg_prot_tablesize.attr, &dev_attr_proc_name.attr, &dev_attr_scan.attr, &dev_attr_hstate.attr, &dev_attr_supported_mode.attr, &dev_attr_active_mode.attr, &dev_attr_prot_capabilities.attr, &dev_attr_prot_guard_type.attr, &dev_attr_host_reset.attr, &dev_attr_eh_deadline.attr, &dev_attr_nr_hw_queues.attr, NULL }; static const struct attribute_group scsi_shost_attr_group = { .attrs = scsi_sysfs_shost_attrs, }; const struct attribute_group *scsi_shost_groups[] = { &scsi_shost_attr_group, NULL }; static void scsi_device_cls_release(struct device *class_dev) { struct scsi_device *sdev; sdev = class_to_sdev(class_dev); put_device(&sdev->sdev_gendev); } static void scsi_device_dev_release(struct device *dev) { struct scsi_device *sdev = to_scsi_device(dev); struct device *parent; struct list_head *this, *tmp; struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL; struct scsi_vpd *vpd_pg0 = NULL, *vpd_pg89 = NULL; struct scsi_vpd *vpd_pgb0 = NULL, *vpd_pgb1 = NULL, *vpd_pgb2 = NULL; struct scsi_vpd *vpd_pgb7 = NULL; unsigned long flags; might_sleep(); scsi_dh_release_device(sdev); parent = sdev->sdev_gendev.parent; spin_lock_irqsave(sdev->host->host_lock, flags); list_del(&sdev->siblings); list_del(&sdev->same_target_siblings); list_del(&sdev->starved_entry); spin_unlock_irqrestore(sdev->host->host_lock, flags); cancel_work_sync(&sdev->event_work); list_for_each_safe(this, tmp, &sdev->event_list) { struct scsi_event *evt; evt = list_entry(this, struct scsi_event, node); list_del(&evt->node); kfree(evt); } blk_put_queue(sdev->request_queue); /* NULL queue means the device can't be used */ sdev->request_queue = NULL; sbitmap_free(&sdev->budget_map); mutex_lock(&sdev->inquiry_mutex); vpd_pg0 = rcu_replace_pointer(sdev->vpd_pg0, vpd_pg0, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pg80 = rcu_replace_pointer(sdev->vpd_pg80, vpd_pg80, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pg83 = rcu_replace_pointer(sdev->vpd_pg83, vpd_pg83, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pg89 = rcu_replace_pointer(sdev->vpd_pg89, vpd_pg89, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb0 = rcu_replace_pointer(sdev->vpd_pgb0, vpd_pgb0, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb1 = rcu_replace_pointer(sdev->vpd_pgb1, vpd_pgb1, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb2 = rcu_replace_pointer(sdev->vpd_pgb2, vpd_pgb2, lockdep_is_held(&sdev->inquiry_mutex)); vpd_pgb7 = rcu_replace_pointer(sdev->vpd_pgb7, vpd_pgb7, lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_pg0) kfree_rcu(vpd_pg0, rcu); if (vpd_pg83) kfree_rcu(vpd_pg83, rcu); if (vpd_pg80) kfree_rcu(vpd_pg80, rcu); if (vpd_pg89) kfree_rcu(vpd_pg89, rcu); if (vpd_pgb0) kfree_rcu(vpd_pgb0, rcu); if (vpd_pgb1) kfree_rcu(vpd_pgb1, rcu); if (vpd_pgb2) kfree_rcu(vpd_pgb2, rcu); if (vpd_pgb7) kfree_rcu(vpd_pgb7, rcu); kfree(sdev->inquiry); kfree(sdev); if (parent) put_device(parent); } static struct class sdev_class = { .name = "scsi_device", .dev_release = scsi_device_cls_release, }; /* all probing is done in the individual ->probe routines */ static int scsi_bus_match(struct device *dev, const struct device_driver *gendrv) { struct scsi_device *sdp; if (dev->type != &scsi_dev_type) return 0; sdp = to_scsi_device(dev); if (sdp->no_uld_attach) return 0; return (sdp->inq_periph_qual == SCSI_INQ_PQ_CON)? 1: 0; } static int scsi_bus_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct scsi_device *sdev; if (dev->type != &scsi_dev_type) return 0; sdev = to_scsi_device(dev); add_uevent_var(env, "MODALIAS=" SCSI_DEVICE_MODALIAS_FMT, sdev->type); return 0; } static int scsi_bus_probe(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); struct scsi_driver *drv = to_scsi_driver(dev->driver); if (drv->probe) return drv->probe(sdp); else return 0; } static void scsi_bus_remove(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); struct scsi_driver *drv = to_scsi_driver(dev->driver); if (drv->remove) drv->remove(sdp); } static void scsi_bus_shutdown(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); struct scsi_driver *drv; if (!dev->driver) return; drv = to_scsi_driver(dev->driver); if (drv->shutdown) drv->shutdown(sdp); } const struct bus_type scsi_bus_type = { .name = "scsi", .match = scsi_bus_match, .uevent = scsi_bus_uevent, .probe = scsi_bus_probe, .remove = scsi_bus_remove, .shutdown = scsi_bus_shutdown, #ifdef CONFIG_PM .pm = &scsi_bus_pm_ops, #endif }; int scsi_sysfs_register(void) { int error; error = bus_register(&scsi_bus_type); if (!error) { error = class_register(&sdev_class); if (error) bus_unregister(&scsi_bus_type); } return error; } void scsi_sysfs_unregister(void) { class_unregister(&sdev_class); bus_unregister(&scsi_bus_type); } /* * sdev_show_function: macro to create an attr function that can be used to * show a non-bit field. */ #define sdev_show_function(field, format_string) \ static ssize_t \ sdev_show_##field (struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct scsi_device *sdev; \ sdev = to_scsi_device(dev); \ return snprintf (buf, 20, format_string, sdev->field); \ } \ /* * sdev_rd_attr: macro to create a function and attribute variable for a * read only field. */ #define sdev_rd_attr(field, format_string) \ sdev_show_function(field, format_string) \ static DEVICE_ATTR(field, S_IRUGO, sdev_show_##field, NULL); /* * Create the actual show/store functions and data structures. */ sdev_rd_attr (type, "%d\n"); sdev_rd_attr (scsi_level, "%d\n"); sdev_rd_attr (vendor, "%.8s\n"); sdev_rd_attr (model, "%.16s\n"); sdev_rd_attr (rev, "%.4s\n"); sdev_rd_attr (cdl_supported, "%d\n"); static ssize_t sdev_show_device_busy(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); return snprintf(buf, 20, "%d\n", scsi_device_busy(sdev)); } static DEVICE_ATTR(device_busy, S_IRUGO, sdev_show_device_busy, NULL); static ssize_t sdev_show_device_blocked(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); return snprintf(buf, 20, "%d\n", atomic_read(&sdev->device_blocked)); } static DEVICE_ATTR(device_blocked, S_IRUGO, sdev_show_device_blocked, NULL); /* * TODO: can we make these symlinks to the block layer ones? */ static ssize_t sdev_show_timeout (struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev; sdev = to_scsi_device(dev); return snprintf(buf, 20, "%d\n", sdev->request_queue->rq_timeout / HZ); } static ssize_t sdev_store_timeout (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); int ret, timeout; ret = kstrtoint(buf, 0, &timeout); if (ret) return ret; if (timeout <= 0) return -EINVAL; blk_queue_rq_timeout(sdev->request_queue, timeout * HZ); return count; } static DEVICE_ATTR(timeout, S_IRUGO | S_IWUSR, sdev_show_timeout, sdev_store_timeout); static ssize_t sdev_show_eh_timeout(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev; sdev = to_scsi_device(dev); return snprintf(buf, 20, "%u\n", sdev->eh_timeout / HZ); } static ssize_t sdev_store_eh_timeout(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev; unsigned int eh_timeout; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; sdev = to_scsi_device(dev); err = kstrtouint(buf, 10, &eh_timeout); if (err) return err; sdev->eh_timeout = eh_timeout * HZ; return count; } static DEVICE_ATTR(eh_timeout, S_IRUGO | S_IWUSR, sdev_show_eh_timeout, sdev_store_eh_timeout); static ssize_t store_rescan_field (struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { scsi_rescan_device(to_scsi_device(dev)); return count; } static DEVICE_ATTR(rescan, S_IWUSR, NULL, store_rescan_field); static ssize_t sdev_store_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct kernfs_node *kn; struct scsi_device *sdev = to_scsi_device(dev); /* * We need to try to get module, avoiding the module been removed * during delete. */ if (scsi_device_get(sdev)) return -ENODEV; kn = sysfs_break_active_protection(&dev->kobj, &attr->attr); WARN_ON_ONCE(!kn); /* * Concurrent writes into the "delete" sysfs attribute may trigger * concurrent calls to device_remove_file() and scsi_remove_device(). * device_remove_file() handles concurrent removal calls by * serializing these and by ignoring the second and later removal * attempts. Concurrent calls of scsi_remove_device() are * serialized. The second and later calls of scsi_remove_device() are * ignored because the first call of that function changes the device * state into SDEV_DEL. */ device_remove_file(dev, attr); scsi_remove_device(sdev); if (kn) sysfs_unbreak_active_protection(kn); scsi_device_put(sdev); return count; }; static DEVICE_ATTR(delete, S_IWUSR, NULL, sdev_store_delete); static ssize_t store_state_field(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i, ret; struct scsi_device *sdev = to_scsi_device(dev); enum scsi_device_state state = 0; bool rescan_dev = false; for (i = 0; i < ARRAY_SIZE(sdev_states); i++) { const int len = strlen(sdev_states[i].name); if (strncmp(sdev_states[i].name, buf, len) == 0 && buf[len] == '\n') { state = sdev_states[i].value; break; } } switch (state) { case SDEV_RUNNING: case SDEV_OFFLINE: break; default: return -EINVAL; } mutex_lock(&sdev->state_mutex); switch (sdev->sdev_state) { case SDEV_RUNNING: case SDEV_OFFLINE: break; default: mutex_unlock(&sdev->state_mutex); return -EINVAL; } if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { ret = 0; } else { ret = scsi_device_set_state(sdev, state); if (ret == 0 && state == SDEV_RUNNING) rescan_dev = true; } mutex_unlock(&sdev->state_mutex); if (rescan_dev) { /* * If the device state changes to SDEV_RUNNING, we need to * run the queue to avoid I/O hang, and rescan the device * to revalidate it. Running the queue first is necessary * because another thread may be waiting inside * blk_mq_freeze_queue_wait() and because that call may be * waiting for pending I/O to finish. */ blk_mq_run_hw_queues(sdev->request_queue, true); scsi_rescan_device(sdev); } return ret == 0 ? count : -EINVAL; } static ssize_t show_state_field(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); const char *name = scsi_device_state_name(sdev->sdev_state); if (!name) return -EINVAL; return snprintf(buf, 20, "%s\n", name); } static DEVICE_ATTR(state, S_IRUGO | S_IWUSR, show_state_field, store_state_field); static ssize_t show_queue_type_field(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); const char *name = "none"; if (sdev->simple_tags) name = "simple"; return snprintf(buf, 20, "%s\n", name); } static ssize_t store_queue_type_field(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); if (!sdev->tagged_supported) return -EINVAL; sdev_printk(KERN_INFO, sdev, "ignoring write to deprecated queue_type attribute"); return count; } static DEVICE_ATTR(queue_type, S_IRUGO | S_IWUSR, show_queue_type_field, store_queue_type_field); #define sdev_vpd_pg_attr(_page) \ static ssize_t \ show_vpd_##_page(struct file *filp, struct kobject *kobj, \ const struct bin_attribute *bin_attr, \ char *buf, loff_t off, size_t count) \ { \ struct device *dev = kobj_to_dev(kobj); \ struct scsi_device *sdev = to_scsi_device(dev); \ struct scsi_vpd *vpd_page; \ int ret = -EINVAL; \ \ rcu_read_lock(); \ vpd_page = rcu_dereference(sdev->vpd_##_page); \ if (vpd_page) \ ret = memory_read_from_buffer(buf, count, &off, \ vpd_page->data, vpd_page->len); \ rcu_read_unlock(); \ return ret; \ } \ static const struct bin_attribute dev_attr_vpd_##_page = { \ .attr = {.name = __stringify(vpd_##_page), .mode = S_IRUGO }, \ .size = 0, \ .read = show_vpd_##_page, \ }; sdev_vpd_pg_attr(pg83); sdev_vpd_pg_attr(pg80); sdev_vpd_pg_attr(pg89); sdev_vpd_pg_attr(pgb0); sdev_vpd_pg_attr(pgb1); sdev_vpd_pg_attr(pgb2); sdev_vpd_pg_attr(pgb7); sdev_vpd_pg_attr(pg0); static ssize_t show_inquiry(struct file *filep, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); struct scsi_device *sdev = to_scsi_device(dev); if (!sdev->inquiry) return -EINVAL; return memory_read_from_buffer(buf, count, &off, sdev->inquiry, sdev->inquiry_len); } static const struct bin_attribute dev_attr_inquiry = { .attr = { .name = "inquiry", .mode = S_IRUGO, }, .size = 0, .read = show_inquiry, }; static ssize_t show_iostat_counterbits(struct device *dev, struct device_attribute *attr, char *buf) { return snprintf(buf, 20, "%d\n", (int)sizeof(atomic_t) * 8); } static DEVICE_ATTR(iocounterbits, S_IRUGO, show_iostat_counterbits, NULL); #define show_sdev_iostat(field) \ static ssize_t \ show_iostat_##field(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct scsi_device *sdev = to_scsi_device(dev); \ unsigned long long count = atomic_read(&sdev->field); \ return snprintf(buf, 20, "0x%llx\n", count); \ } \ static DEVICE_ATTR(field, S_IRUGO, show_iostat_##field, NULL) show_sdev_iostat(iorequest_cnt); show_sdev_iostat(iodone_cnt); show_sdev_iostat(ioerr_cnt); show_sdev_iostat(iotmo_cnt); static ssize_t sdev_show_modalias(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev; sdev = to_scsi_device(dev); return snprintf (buf, 20, SCSI_DEVICE_MODALIAS_FMT "\n", sdev->type); } static DEVICE_ATTR(modalias, S_IRUGO, sdev_show_modalias, NULL); #define DECLARE_EVT_SHOW(name, Cap_name) \ static ssize_t \ sdev_show_evt_##name(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ struct scsi_device *sdev = to_scsi_device(dev); \ int val = test_bit(SDEV_EVT_##Cap_name, sdev->supported_events);\ return snprintf(buf, 20, "%d\n", val); \ } #define DECLARE_EVT_STORE(name, Cap_name) \ static ssize_t \ sdev_store_evt_##name(struct device *dev, struct device_attribute *attr,\ const char *buf, size_t count) \ { \ struct scsi_device *sdev = to_scsi_device(dev); \ int val = simple_strtoul(buf, NULL, 0); \ if (val == 0) \ clear_bit(SDEV_EVT_##Cap_name, sdev->supported_events); \ else if (val == 1) \ set_bit(SDEV_EVT_##Cap_name, sdev->supported_events); \ else \ return -EINVAL; \ return count; \ } #define DECLARE_EVT(name, Cap_name) \ DECLARE_EVT_SHOW(name, Cap_name) \ DECLARE_EVT_STORE(name, Cap_name) \ static DEVICE_ATTR(evt_##name, S_IRUGO, sdev_show_evt_##name, \ sdev_store_evt_##name); #define REF_EVT(name) &dev_attr_evt_##name.attr DECLARE_EVT(media_change, MEDIA_CHANGE) DECLARE_EVT(inquiry_change_reported, INQUIRY_CHANGE_REPORTED) DECLARE_EVT(capacity_change_reported, CAPACITY_CHANGE_REPORTED) DECLARE_EVT(soft_threshold_reached, SOFT_THRESHOLD_REACHED_REPORTED) DECLARE_EVT(mode_parameter_change_reported, MODE_PARAMETER_CHANGE_REPORTED) DECLARE_EVT(lun_change_reported, LUN_CHANGE_REPORTED) static ssize_t sdev_store_queue_depth(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int depth, retval; struct scsi_device *sdev = to_scsi_device(dev); const struct scsi_host_template *sht = sdev->host->hostt; if (!sht->change_queue_depth) return -EINVAL; depth = simple_strtoul(buf, NULL, 0); if (depth < 1 || depth > sdev->host->can_queue) return -EINVAL; retval = sht->change_queue_depth(sdev, depth); if (retval < 0) return retval; sdev->max_queue_depth = sdev->queue_depth; return count; } sdev_show_function(queue_depth, "%d\n"); static DEVICE_ATTR(queue_depth, S_IRUGO | S_IWUSR, sdev_show_queue_depth, sdev_store_queue_depth); static ssize_t sdev_show_wwid(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); ssize_t count; count = scsi_vpd_lun_id(sdev, buf, PAGE_SIZE); if (count > 0) { buf[count] = '\n'; count++; } return count; } static DEVICE_ATTR(wwid, S_IRUGO, sdev_show_wwid, NULL); static ssize_t sdev_show_serial(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); ssize_t ret; ret = scsi_vpd_lun_serial(sdev, buf, PAGE_SIZE - 1); if (ret < 0) return ret; buf[ret] = '\n'; return ret + 1; } static DEVICE_ATTR(serial, S_IRUGO, sdev_show_serial, NULL); #define BLIST_FLAG_NAME(name) \ [const_ilog2((__force __u64)BLIST_##name)] = #name static const char *const sdev_bflags_name[] = { #include "scsi_devinfo_tbl.c" }; #undef BLIST_FLAG_NAME static ssize_t sdev_show_blacklist(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); int i; ssize_t len = 0; for (i = 0; i < sizeof(sdev->sdev_bflags) * BITS_PER_BYTE; i++) { const char *name = NULL; if (!(sdev->sdev_bflags & (__force blist_flags_t)BIT(i))) continue; if (i < ARRAY_SIZE(sdev_bflags_name) && sdev_bflags_name[i]) name = sdev_bflags_name[i]; if (name) len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? " " : "", name); else len += scnprintf(buf + len, PAGE_SIZE - len, "%sINVALID_BIT(%d)", len ? " " : "", i); } if (len) len += scnprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } static DEVICE_ATTR(blacklist, S_IRUGO, sdev_show_blacklist, NULL); #ifdef CONFIG_SCSI_DH static ssize_t sdev_show_dh_state(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); if (!sdev->handler) return snprintf(buf, 20, "detached\n"); return snprintf(buf, 20, "%s\n", sdev->handler->name); } static ssize_t sdev_store_dh_state(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); int err = -EINVAL; if (sdev->sdev_state == SDEV_CANCEL || sdev->sdev_state == SDEV_DEL) return -ENODEV; if (!sdev->handler) { /* * Attach to a device handler */ err = scsi_dh_attach(sdev->request_queue, buf); } else if (!strncmp(buf, "activate", 8)) { /* * Activate a device handler */ if (sdev->handler->activate) err = sdev->handler->activate(sdev, NULL, NULL); else err = 0; } else if (!strncmp(buf, "detach", 6)) { /* * Detach from a device handler */ sdev_printk(KERN_WARNING, sdev, "can't detach handler %s.\n", sdev->handler->name); err = -EINVAL; } return err < 0 ? err : count; } static DEVICE_ATTR(dh_state, S_IRUGO | S_IWUSR, sdev_show_dh_state, sdev_store_dh_state); static ssize_t sdev_show_access_state(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); unsigned char access_state; const char *access_state_name; if (!sdev->handler) return -EINVAL; access_state = (sdev->access_state & SCSI_ACCESS_STATE_MASK); access_state_name = scsi_access_state_name(access_state); return sprintf(buf, "%s\n", access_state_name ? access_state_name : "unknown"); } static DEVICE_ATTR(access_state, S_IRUGO, sdev_show_access_state, NULL); static ssize_t sdev_show_preferred_path(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); if (!sdev->handler) return -EINVAL; if (sdev->access_state & SCSI_ACCESS_STATE_PREFERRED) return sprintf(buf, "1\n"); else return sprintf(buf, "0\n"); } static DEVICE_ATTR(preferred_path, S_IRUGO, sdev_show_preferred_path, NULL); #endif static ssize_t sdev_show_queue_ramp_up_period(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev; sdev = to_scsi_device(dev); return snprintf(buf, 20, "%u\n", jiffies_to_msecs(sdev->queue_ramp_up_period)); } static ssize_t sdev_store_queue_ramp_up_period(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_device *sdev = to_scsi_device(dev); unsigned int period; if (kstrtouint(buf, 10, &period)) return -EINVAL; sdev->queue_ramp_up_period = msecs_to_jiffies(period); return count; } static DEVICE_ATTR(queue_ramp_up_period, S_IRUGO | S_IWUSR, sdev_show_queue_ramp_up_period, sdev_store_queue_ramp_up_period); static ssize_t sdev_show_cdl_enable(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_device *sdev = to_scsi_device(dev); return sysfs_emit(buf, "%d\n", (int)sdev->cdl_enable); } static ssize_t sdev_store_cdl_enable(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ret; bool v; if (kstrtobool(buf, &v)) return -EINVAL; ret = scsi_cdl_enable(to_scsi_device(dev), v); if (ret) return ret; return count; } static DEVICE_ATTR(cdl_enable, S_IRUGO | S_IWUSR, sdev_show_cdl_enable, sdev_store_cdl_enable); static umode_t scsi_sdev_attr_is_visible(struct kobject *kobj, struct attribute *attr, int i) { struct device *dev = kobj_to_dev(kobj); struct scsi_device *sdev = to_scsi_device(dev); if (attr == &dev_attr_queue_depth.attr && !sdev->host->hostt->change_queue_depth) return S_IRUGO; if (attr == &dev_attr_queue_ramp_up_period.attr && !sdev->host->hostt->change_queue_depth) return 0; return attr->mode; } static umode_t scsi_sdev_bin_attr_is_visible(struct kobject *kobj, const struct bin_attribute *attr, int i) { struct device *dev = kobj_to_dev(kobj); struct scsi_device *sdev = to_scsi_device(dev); if (attr == &dev_attr_vpd_pg0 && !sdev->vpd_pg0) return 0; if (attr == &dev_attr_vpd_pg80 && !sdev->vpd_pg80) return 0; if (attr == &dev_attr_vpd_pg83 && !sdev->vpd_pg83) return 0; if (attr == &dev_attr_vpd_pg89 && !sdev->vpd_pg89) return 0; if (attr == &dev_attr_vpd_pgb0 && !sdev->vpd_pgb0) return 0; if (attr == &dev_attr_vpd_pgb1 && !sdev->vpd_pgb1) return 0; if (attr == &dev_attr_vpd_pgb2 && !sdev->vpd_pgb2) return 0; if (attr == &dev_attr_vpd_pgb7 && !sdev->vpd_pgb7) return 0; return S_IRUGO; } /* Default template for device attributes. May NOT be modified */ static struct attribute *scsi_sdev_attrs[] = { &dev_attr_device_blocked.attr, &dev_attr_type.attr, &dev_attr_scsi_level.attr, &dev_attr_device_busy.attr, &dev_attr_vendor.attr, &dev_attr_model.attr, &dev_attr_serial.attr, &dev_attr_rev.attr, &dev_attr_rescan.attr, &dev_attr_delete.attr, &dev_attr_state.attr, &dev_attr_timeout.attr, &dev_attr_eh_timeout.attr, &dev_attr_iocounterbits.attr, &dev_attr_iorequest_cnt.attr, &dev_attr_iodone_cnt.attr, &dev_attr_ioerr_cnt.attr, &dev_attr_iotmo_cnt.attr, &dev_attr_modalias.attr, &dev_attr_queue_depth.attr, &dev_attr_queue_type.attr, &dev_attr_wwid.attr, &dev_attr_blacklist.attr, #ifdef CONFIG_SCSI_DH &dev_attr_dh_state.attr, &dev_attr_access_state.attr, &dev_attr_preferred_path.attr, #endif &dev_attr_queue_ramp_up_period.attr, &dev_attr_cdl_supported.attr, &dev_attr_cdl_enable.attr, REF_EVT(media_change), REF_EVT(inquiry_change_reported), REF_EVT(capacity_change_reported), REF_EVT(soft_threshold_reached), REF_EVT(mode_parameter_change_reported), REF_EVT(lun_change_reported), NULL }; static const struct bin_attribute *const scsi_sdev_bin_attrs[] = { &dev_attr_vpd_pg0, &dev_attr_vpd_pg83, &dev_attr_vpd_pg80, &dev_attr_vpd_pg89, &dev_attr_vpd_pgb0, &dev_attr_vpd_pgb1, &dev_attr_vpd_pgb2, &dev_attr_vpd_pgb7, &dev_attr_inquiry, NULL }; static struct attribute_group scsi_sdev_attr_group = { .attrs = scsi_sdev_attrs, .bin_attrs = scsi_sdev_bin_attrs, .is_visible = scsi_sdev_attr_is_visible, .is_bin_visible = scsi_sdev_bin_attr_is_visible, }; static const struct attribute_group *scsi_sdev_attr_groups[] = { &scsi_sdev_attr_group, NULL }; static int scsi_target_add(struct scsi_target *starget) { int error; if (starget->state != STARGET_CREATED) return 0; error = device_add(&starget->dev); if (error) { dev_err(&starget->dev, "target device_add failed, error %d\n", error); return error; } transport_add_device(&starget->dev); starget->state = STARGET_RUNNING; pm_runtime_set_active(&starget->dev); pm_runtime_enable(&starget->dev); device_enable_async_suspend(&starget->dev); return 0; } /** * scsi_sysfs_add_sdev - add scsi device to sysfs * @sdev: scsi_device to add * * Return value: * 0 on Success / non-zero on Failure **/ int scsi_sysfs_add_sdev(struct scsi_device *sdev) { int error; struct scsi_target *starget = sdev->sdev_target; if (WARN_ON_ONCE(scsi_device_is_pseudo_dev(sdev))) return -EINVAL; error = scsi_target_add(starget); if (error) return error; transport_configure_device(&starget->dev); device_enable_async_suspend(&sdev->sdev_gendev); scsi_autopm_get_target(starget); pm_runtime_set_active(&sdev->sdev_gendev); if (!sdev->rpm_autosuspend) pm_runtime_forbid(&sdev->sdev_gendev); pm_runtime_enable(&sdev->sdev_gendev); scsi_autopm_put_target(starget); scsi_autopm_get_device(sdev); scsi_dh_add_device(sdev); error = device_add(&sdev->sdev_gendev); if (error) { sdev_printk(KERN_INFO, sdev, "failed to add device: %d\n", error); return error; } device_enable_async_suspend(&sdev->sdev_dev); error = device_add(&sdev->sdev_dev); if (error) { sdev_printk(KERN_INFO, sdev, "failed to add class device: %d\n", error); device_del(&sdev->sdev_gendev); return error; } transport_add_device(&sdev->sdev_gendev); sdev->is_visible = 1; if (IS_ENABLED(CONFIG_BLK_DEV_BSG)) { sdev->bsg_dev = scsi_bsg_register_queue(sdev); if (IS_ERR(sdev->bsg_dev)) { error = PTR_ERR(sdev->bsg_dev); sdev_printk(KERN_INFO, sdev, "Failed to register bsg queue, errno=%d\n", error); sdev->bsg_dev = NULL; } } scsi_autopm_put_device(sdev); return error; } void __scsi_remove_device(struct scsi_device *sdev) { struct device *dev = &sdev->sdev_gendev; int res; /* * This cleanup path is not reentrant and while it is impossible * to get a new reference with scsi_device_get() someone can still * hold a previously acquired one. */ if (sdev->sdev_state == SDEV_DEL) return; if (sdev->is_visible) { /* * If scsi_internal_target_block() is running concurrently, * wait until it has finished before changing the device state. */ mutex_lock(&sdev->state_mutex); /* * If blocked, we go straight to DEL and restart the queue so * any commands issued during driver shutdown (like sync * cache) are errored immediately. */ res = scsi_device_set_state(sdev, SDEV_CANCEL); if (res != 0) { res = scsi_device_set_state(sdev, SDEV_DEL); if (res == 0) scsi_start_queue(sdev); } mutex_unlock(&sdev->state_mutex); if (res != 0) return; if (IS_ENABLED(CONFIG_BLK_DEV_BSG) && sdev->bsg_dev) bsg_unregister_queue(sdev->bsg_dev); device_unregister(&sdev->sdev_dev); transport_remove_device(dev); device_del(dev); } else put_device(&sdev->sdev_dev); /* * Stop accepting new requests and wait until all queuecommand() and * scsi_run_queue() invocations have finished before tearing down the * device. */ mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_DEL); mutex_unlock(&sdev->state_mutex); blk_mq_destroy_queue(sdev->request_queue); kref_put(&sdev->host->tagset_refcnt, scsi_mq_free_tags); cancel_work_sync(&sdev->requeue_work); if (!scsi_device_is_pseudo_dev(sdev) && sdev->host->hostt->sdev_destroy) sdev->host->hostt->sdev_destroy(sdev); transport_destroy_device(dev); /* * Paired with the kref_get() in scsi_sysfs_initialize(). We have * removed sysfs visibility from the device, so make the target * invisible if this was the last device underneath it. */ scsi_target_reap(scsi_target(sdev)); put_device(dev); } /** * scsi_remove_device - unregister a device from the scsi bus * @sdev: scsi_device to unregister **/ void scsi_remove_device(struct scsi_device *sdev) { struct Scsi_Host *shost = sdev->host; mutex_lock(&shost->scan_mutex); __scsi_remove_device(sdev); mutex_unlock(&shost->scan_mutex); } EXPORT_SYMBOL(scsi_remove_device); static void __scsi_remove_target(struct scsi_target *starget) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); unsigned long flags; struct scsi_device *sdev; spin_lock_irqsave(shost->host_lock, flags); restart: list_for_each_entry(sdev, &shost->__devices, siblings) { /* * We cannot call scsi_device_get() here, as * we might've been called from rmmod() causing * scsi_device_get() to fail the module_is_live() * check. */ if (sdev->channel != starget->channel || sdev->id != starget->id) continue; if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL || !get_device(&sdev->sdev_gendev)) continue; spin_unlock_irqrestore(shost->host_lock, flags); scsi_remove_device(sdev); put_device(&sdev->sdev_gendev); spin_lock_irqsave(shost->host_lock, flags); goto restart; } spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_remove_target - try to remove a target and all its devices * @dev: generic starget or parent of generic stargets to be removed * * Note: This is slightly racy. It is possible that if the user * requests the addition of another device then the target won't be * removed. */ void scsi_remove_target(struct device *dev) { struct Scsi_Host *shost = dev_to_shost(dev->parent); struct scsi_target *starget; unsigned long flags; restart: spin_lock_irqsave(shost->host_lock, flags); list_for_each_entry(starget, &shost->__targets, siblings) { if (starget->state == STARGET_DEL || starget->state == STARGET_REMOVE || starget->state == STARGET_CREATED_REMOVE) continue; if (starget->dev.parent == dev || &starget->dev == dev) { kref_get(&starget->reap_ref); if (starget->state == STARGET_CREATED) starget->state = STARGET_CREATED_REMOVE; else starget->state = STARGET_REMOVE; spin_unlock_irqrestore(shost->host_lock, flags); __scsi_remove_target(starget); scsi_target_reap(starget); goto restart; } } spin_unlock_irqrestore(shost->host_lock, flags); } EXPORT_SYMBOL(scsi_remove_target); static int scsi_legacy_probe(struct scsi_device *sdp) { struct device *dev = &sdp->sdev_gendev; struct device_driver *driver = dev->driver; return driver->probe(dev); } static void scsi_legacy_remove(struct scsi_device *sdp) { struct device *dev = &sdp->sdev_gendev; struct device_driver *driver = dev->driver; driver->remove(dev); } static void scsi_legacy_shutdown(struct scsi_device *sdp) { struct device *dev = &sdp->sdev_gendev; struct device_driver *driver = dev->driver; driver->shutdown(dev); } int __scsi_register_driver(struct scsi_driver *sdrv, struct module *owner) { struct device_driver *drv = &sdrv->gendrv; drv->bus = &scsi_bus_type; drv->owner = owner; if (!sdrv->probe && drv->probe) sdrv->probe = scsi_legacy_probe; if (!sdrv->remove && drv->remove) sdrv->remove = scsi_legacy_remove; if (!sdrv->shutdown && drv->shutdown) sdrv->shutdown = scsi_legacy_shutdown; return driver_register(drv); } EXPORT_SYMBOL(__scsi_register_driver); int scsi_register_interface(struct class_interface *intf) { intf->class = &sdev_class; return class_interface_register(intf); } EXPORT_SYMBOL(scsi_register_interface); /** * scsi_sysfs_add_host - add scsi host to subsystem * @shost: scsi host struct to add to subsystem **/ int scsi_sysfs_add_host(struct Scsi_Host *shost) { transport_register_device(&shost->shost_gendev); transport_configure_device(&shost->shost_gendev); return 0; } static const struct device_type scsi_dev_type = { .name = "scsi_device", .release = scsi_device_dev_release, .groups = scsi_sdev_attr_groups, }; void scsi_sysfs_device_initialize(struct scsi_device *sdev) { unsigned long flags; struct Scsi_Host *shost = sdev->host; const struct scsi_host_template *hostt = shost->hostt; struct scsi_target *starget = sdev->sdev_target; device_initialize(&sdev->sdev_gendev); sdev->sdev_gendev.bus = &scsi_bus_type; sdev->sdev_gendev.type = &scsi_dev_type; scsi_enable_async_suspend(&sdev->sdev_gendev); dev_set_name(&sdev->sdev_gendev, "%d:%d:%d:%llu", sdev->host->host_no, sdev->channel, sdev->id, sdev->lun); sdev->sdev_gendev.groups = hostt->sdev_groups; device_initialize(&sdev->sdev_dev); sdev->sdev_dev.parent = get_device(&sdev->sdev_gendev); sdev->sdev_dev.class = &sdev_class; dev_set_name(&sdev->sdev_dev, "%d:%d:%d:%llu", sdev->host->host_no, sdev->channel, sdev->id, sdev->lun); /* * Get a default scsi_level from the target (derived from sibling * devices). This is the best we can do for guessing how to set * sdev->lun_in_cdb for the initial INQUIRY command. For LUN 0 the * setting doesn't matter, because all the bits are zero anyway. * But it does matter for higher LUNs. */ sdev->scsi_level = starget->scsi_level; if (sdev->scsi_level <= SCSI_2 && sdev->scsi_level != SCSI_UNKNOWN && !shost->no_scsi2_lun_in_cdb) sdev->lun_in_cdb = 1; transport_setup_device(&sdev->sdev_gendev); spin_lock_irqsave(shost->host_lock, flags); list_add_tail(&sdev->same_target_siblings, &starget->devices); list_add_tail(&sdev->siblings, &shost->__devices); spin_unlock_irqrestore(shost->host_lock, flags); /* * device can now only be removed via __scsi_remove_device() so hold * the target. Target will be held in CREATED state until something * beneath it becomes visible (in which case it moves to RUNNING) */ kref_get(&starget->reap_ref); } int scsi_is_sdev_device(const struct device *dev) { return dev->type == &scsi_dev_type; } EXPORT_SYMBOL(scsi_is_sdev_device); /* A blank transport template that is used in drivers that don't * yet implement Transport Attributes */ struct scsi_transport_template blank_transport_template = { { { {NULL, }, }, }, };
344 7 198 23574 46 4038 8986 1562 2684 322 11 11 7 6 2 31 375 341 7 4 5 3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ #include <linux/cleanup.h> #include <linux/fault-inject-usercopy.h> #include <linux/instrumented.h> #include <linux/minmax.h> #include <linux/nospec.h> #include <linux/sched.h> #include <linux/ucopysize.h> #include <asm/uaccess.h> /* * Architectures that support memory tagging (assigning tags to memory regions, * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. * * Passing down mm_struct allows to define untagging rules on per-process * basis. * * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) #endif #ifndef untagged_addr_remote #define untagged_addr_remote(mm, addr) ({ \ mmap_assert_locked(mm); \ untagged_addr(addr); \ }) #endif #ifdef masked_user_access_begin #define can_do_masked_user_access() 1 # ifndef masked_user_write_access_begin # define masked_user_write_access_begin masked_user_access_begin # endif # ifndef masked_user_read_access_begin # define masked_user_read_access_begin masked_user_access_begin #endif #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL #define masked_user_read_access_begin(src) NULL #define masked_user_write_access_begin(src) NULL #define mask_user_address(src) (src) #endif /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and * __copy_{to,from}_user{,_inatomic}(). * * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and * return the amount left to copy. They should assume that access_ok() has * already been checked (and succeeded); they should *not* zero-pad anything. * No KASAN or object size checks either - those belong here. * * Both of these functions should attempt to copy size bytes starting at from * into the area starting at to. They must not fetch or store anything * outside of those areas. Return value must be between 0 (everything * copied successfully) and size (nothing copied). * * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting * at to must become equal to the bytes fetched from the corresponding area * starting at from. All data past to + size - N must be left unmodified. * * If copying succeeds, the return value must be 0. If some data cannot be * fetched, it is permitted to copy less than had been fetched; the only * hard requirement is that not storing anything at all (i.e. returning size) * should happen only when nothing could be copied. In other words, you don't * have to squeeze as much as possible - it is allowed, but not necessary. * * For raw_copy_from_user() to always points to kernel memory and no faults * on store should happen. Interpretation of from is affected by set_fs(). * For raw_copy_to_user() it's the other way round. * * Both can be inlined - it's up to architectures whether it wants to bother * with that. They should not be used directly; they are used to implement * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic()) * that are used instead. Out of those, __... ones are inlined. Plain * copy_{to,from}_user() might or might not be inlined. If you want them * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. * * NOTE: only copy_from_user() zero-pads the destination in case of short copy. * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything * at all; their callers absolutely must check the return value. * * Biarch ones should also provide raw_copy_in_user() - similar to the above, * but both source and destination are __user pointers (affected by set_fs() * as usual) and both source and destination can trigger faults. */ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { unsigned long res; instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res; might_fault(); instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; check_object_size(to, n, false); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); return res; } /** * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. * @to: Destination address, in user space. * @from: Source address, in kernel space. * @n: Number of bytes to copy. * * Context: User context only. * * Copy data from kernel space to user space. Caller must check * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. */ static __always_inline __must_check unsigned long __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } static __always_inline __must_check unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; instrument_copy_to_user(to, from, n); check_object_size(from, n, true); return raw_copy_to_user(to, from, n); } /* * Architectures that #define INLINE_COPY_TO_USER use this function * directly in the normal copy_to/from_user(), the other ones go * through an extern _copy_to/from_user(), which expands the same code * here. */ static inline __must_check unsigned long _inline_copy_from_user(void *to, const void __user *from, unsigned long n) { unsigned long res = n; might_fault(); if (should_fail_usercopy()) goto fail; if (can_do_masked_user_access()) from = mask_user_address(from); else { if (!access_ok(from, n)) goto fail; /* * Ensure that bad access_ok() speculation will not * lead to nasty side effects *after* the copy is * finished: */ barrier_nospec(); } instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); instrument_copy_from_user_after(to, from, n, res); if (likely(!res)) return 0; fail: memset(to + (n - res), 0, res); return res; } #ifndef INLINE_COPY_FROM_USER extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); #endif static inline __must_check unsigned long _inline_copy_to_user(void __user *to, const void *from, unsigned long n) { might_fault(); if (should_fail_usercopy()) return n; if (access_ok(to, n)) { instrument_copy_to_user(to, from, n); n = raw_copy_to_user(to, from, n); } return n; } #ifndef INLINE_COPY_TO_USER extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); #endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { if (!check_copy_size(to, n, false)) return n; #ifdef INLINE_COPY_FROM_USER return _inline_copy_from_user(to, from, n); #else return _copy_from_user(to, from, n); #endif } static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { if (!check_copy_size(from, n, true)) return n; #ifdef INLINE_COPY_TO_USER return _inline_copy_to_user(to, from, n); #else return _copy_to_user(to, from, n); #endif } #ifndef copy_mc_to_kernel /* * Without arch opt-in this generic copy_mc_to_kernel() will not handle * #MC (or arch equivalent) during source read. */ static inline unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, size_t cnt) { memcpy(dst, src, cnt); return 0; } #endif static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; } static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; } /* * These routines enable/disable the pagefault handler. If disabled, it will * not take any locks and go straight to the fixup table. * * User access methods will not sleep when called from a pagefault_disabled() * environment. */ static inline void pagefault_disable(void) { pagefault_disabled_inc(); /* * make sure to have issued the store before a pagefault * can hit. */ barrier(); } static inline void pagefault_enable(void) { /* * make sure to issue those last loads/stores before enabling * the pagefault handler again. */ barrier(); pagefault_disabled_dec(); } /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ static inline bool pagefault_disabled(void) { return current->pagefault_disabled != 0; } /* * The pagefault handler is in general disabled by pagefault_disable() or * when in irq context (via in_atomic()). * * This function should only be used by the fault handlers. Other users should * stick to pagefault_disabled(). * Please NEVER use preempt_disable() to disable the fault handler. With * !CONFIG_PREEMPT_COUNT, this is like a NOP. So the handler won't be disabled. * in_atomic() will report different values based on !CONFIG_PREEMPT_COUNT. */ #define faulthandler_disabled() (pagefault_disabled() || in_atomic()) DEFINE_LOCK_GUARD_0(pagefault, pagefault_disable(), pagefault_enable()) #ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS /** * probe_subpage_writeable: probe the user range for write faults at sub-page * granularity (e.g. arm64 MTE) * @uaddr: start of address range * @size: size of address range * * Returns 0 on success, the number of bytes not probed on fault. * * It is expected that the caller checked for the write permission of each * page in the range either by put_user() or GUP. The architecture port can * implement a more efficient get_user() probing if the same sub-page faults * are triggered by either a read or a write. */ static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size) { return 0; } #endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */ #ifndef ARCH_HAS_NOCACHE_UACCESS static inline __must_check unsigned long __copy_from_user_inatomic_nocache(void *to, const void __user *from, unsigned long n) { return __copy_from_user_inatomic(to, from, n); } #endif /* ARCH_HAS_NOCACHE_UACCESS */ extern __must_check int check_zeroed_user(const void __user *from, size_t size); /** * copy_struct_from_user: copy a struct from userspace * @dst: Destination address, in kernel space. This buffer must be @ksize * bytes long. * @ksize: Size of @dst struct. * @src: Source address, in userspace. * @usize: (Alleged) size of @src struct. * * Copies a struct from userspace to kernel space, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * @ksize is just sizeof(*dst), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize) * { * int err; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); * if (err) * return err; * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the userspace has passed an old struct to a * newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize) * are to be zero-filled. * * If @usize > @ksize, then the userspace has passed a new struct to an * older kernel. The trailing bytes unknown to the kernel (@usize - @ksize) * are checked to ensure they are zeroed, otherwise -E2BIG is returned. * * Returns (in all cases, some data may have been copied): * * -E2BIG: (@usize > @ksize) and there are non-zero trailing bytes in @src. * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_from_user(void *dst, size_t ksize, const void __user *src, size_t usize) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize < ksize) { memset(dst + size, 0, rest); } else if (usize > ksize) { int ret = check_zeroed_user(src + size, rest); if (ret <= 0) return ret ?: -E2BIG; } /* Copy the interoperable parts of the struct. */ if (copy_from_user(dst, src, size)) return -EFAULT; return 0; } /** * copy_struct_to_user: copy a struct to userspace * @dst: Destination address, in userspace. This buffer must be @ksize * bytes long. * @usize: (Alleged) size of @dst struct. * @src: Source address, in kernel space. * @ksize: Size of @src struct. * @ignored_trailing: Set to %true if there was a non-zero byte in @src that * userspace cannot see because they are using an smaller struct. * * Copies a struct from kernel space to userspace, in a way that guarantees * backwards-compatibility for struct syscall arguments (as long as future * struct extensions are made such that all new fields are *appended* to the * old struct, and zeroed-out new fields have the same meaning as the old * struct). * * Some syscalls may wish to make sure that userspace knows about everything in * the struct, and if there is a non-zero value that userspce doesn't know * about, they want to return an error (such as -EMSGSIZE) or have some other * fallback (such as adding a "you're missing some information" flag). If * @ignored_trailing is non-%NULL, it will be set to %true if there was a * non-zero byte that could not be copied to userspace (ie. was past @usize). * * While unconditionally returning an error in this case is the simplest * solution, for maximum backward compatibility you should try to only return * -EMSGSIZE if the user explicitly requested the data that couldn't be copied. * Note that structure sizes can change due to header changes and simple * recompilations without code changes(!), so if you care about * @ignored_trailing you probably want to make sure that any new field data is * associated with a flag. Otherwise you might assume that a program knows * about data it does not. * * @ksize is just sizeof(*src), and @usize should've been passed by userspace. * The recommended usage is something like the following: * * SYSCALL_DEFINE2(foobar, struct foo __user *, uarg, size_t, usize) * { * int err; * bool ignored_trailing; * struct foo karg = {}; * * if (usize > PAGE_SIZE) * return -E2BIG; * if (usize < FOO_SIZE_VER0) * return -EINVAL; * * // ... modify karg somehow ... * * err = copy_struct_to_user(uarg, usize, &karg, sizeof(karg), * &ignored_trailing); * if (err) * return err; * if (ignored_trailing) * return -EMSGSIZE: * * // ... * } * * There are three cases to consider: * * If @usize == @ksize, then it's copied verbatim. * * If @usize < @ksize, then the kernel is trying to pass userspace a newer * struct than it supports. Thus we only copy the interoperable portions * (@usize) and ignore the rest (but @ignored_trailing is set to %true if * any of the trailing (@ksize - @usize) bytes are non-zero). * * If @usize > @ksize, then the kernel is trying to pass userspace an older * struct than userspace supports. In order to make sure the * unknown-to-the-kernel fields don't contain garbage values, we zero the * trailing (@usize - @ksize) bytes. * * Returns (in all cases, some data may have been copied): * * -EFAULT: access to userspace failed. */ static __always_inline __must_check int copy_struct_to_user(void __user *dst, size_t usize, const void *src, size_t ksize, bool *ignored_trailing) { size_t size = min(ksize, usize); size_t rest = max(ksize, usize) - size; /* Double check if ksize is larger than a known object size. */ if (WARN_ON_ONCE(ksize > __builtin_object_size(src, 1))) return -E2BIG; /* Deal with trailing bytes. */ if (usize > ksize) { if (clear_user(dst + size, rest)) return -EFAULT; } if (ignored_trailing) *ignored_trailing = ksize < usize && memchr_inv(src + size, 0, rest) != NULL; /* Copy the interoperable parts of the struct. */ if (copy_to_user(dst, src, size)) return -EFAULT; return 0; } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size); long copy_from_kernel_nofault(void *dst, const void *src, size_t size); long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size); long copy_from_user_nofault(void *dst, const void __user *src, size_t size); long notrace copy_to_user_nofault(void __user *dst, const void *src, size_t size); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count); long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); #ifdef arch_get_kernel_nofault /* * Wrap the architecture implementation so that @label can be outside of a * cleanup() scope. A regular C goto works correctly, but ASM goto does * not. Clang rejects such an attempt, but GCC silently emits buggy code. */ #define __get_kernel_nofault(dst, src, type, label) \ do { \ __label__ local_label; \ arch_get_kernel_nofault(dst, src, type, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ __label__ local_label; \ arch_put_kernel_nofault(dst, src, type, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */ #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ type data; \ if (__get_user(data, p)) \ goto label; \ *(type *)dst = data; \ } while (0) #define __put_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(dst); \ type data = *(type *)src; \ if (__put_user(data, p)) \ goto label; \ } while (0) #endif /* !__get_kernel_nofault */ /** * get_kernel_nofault(): safely attempt to read from a location * @val: read into this variable * @ptr: address to read from * * Returns 0 on success, or -EFAULT. */ #define get_kernel_nofault(val, ptr) ({ \ const typeof(val) *__gk_ptr = (ptr); \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) #ifdef user_access_begin #ifdef arch_unsafe_get_user /* * Wrap the architecture implementation so that @label can be outside of a * cleanup() scope. A regular C goto works correctly, but ASM goto does * not. Clang rejects such an attempt, but GCC silently emits buggy code. * * Some architectures use internal local labels already, but this extra * indirection here is harmless because the compiler optimizes it out * completely in any case. This construct just ensures that the ASM GOTO * target is always in the local scope. The C goto 'label' works correctly * when leaving a cleanup() scope. */ #define unsafe_get_user(x, ptr, label) \ do { \ __label__ local_label; \ arch_unsafe_get_user(x, ptr, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #define unsafe_put_user(x, ptr, label) \ do { \ __label__ local_label; \ arch_unsafe_put_user(x, ptr, local_label); \ if (0) { \ local_label: \ goto label; \ } \ } while (0) #endif /* arch_unsafe_get_user */ #else /* user_access_begin */ #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) #define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e) #define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e) #define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e) #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } #endif /* !user_access_begin */ #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end #endif #ifndef user_read_access_begin #define user_read_access_begin user_access_begin #define user_read_access_end user_access_end #endif /* Define RW variant so the below _mode macro expansion works */ #define masked_user_rw_access_begin(u) masked_user_access_begin(u) #define user_rw_access_begin(u, s) user_access_begin(u, s) /* Scoped user access */ /* Cleanup wrapper functions */ static __always_inline void __scoped_user_read_access_end(const void *p) { user_read_access_end(); }; static __always_inline void __scoped_user_write_access_end(const void *p) { user_write_access_end(); }; static __always_inline void __scoped_user_rw_access_end(const void *p) { user_access_end(); }; /** * __scoped_user_access_begin - Start a scoped user access * @mode: The mode of the access class (read, write, rw) * @uptr: The pointer to access user space memory * @size: Size of the access * @elbl: Error label to goto when the access region is rejected * * Internal helper for __scoped_user_access(). Don't use directly. */ #define __scoped_user_access_begin(mode, uptr, size, elbl) \ ({ \ typeof(uptr) __retptr; \ \ if (can_do_masked_user_access()) { \ __retptr = masked_user_##mode##_access_begin(uptr); \ } else { \ __retptr = uptr; \ if (!user_##mode##_access_begin(uptr, size)) \ goto elbl; \ } \ __retptr; \ }) /** * __scoped_user_access - Open a scope for user access * @mode: The mode of the access class (read, write, rw) * @uptr: The pointer to access user space memory * @size: Size of the access * @elbl: Error label to goto when the access region is rejected. It * must be placed outside the scope * * If the user access function inside the scope requires a fault label, it * can use @elbl or a different label outside the scope, which requires * that user access which is implemented with ASM GOTO has been properly * wrapped. See unsafe_get_user() for reference. * * scoped_user_rw_access(ptr, efault) { * unsafe_get_user(rval, &ptr->rval, efault); * unsafe_put_user(wval, &ptr->wval, efault); * } * return 0; * efault: * return -EFAULT; * * The scope is internally implemented as a autoterminating nested for() * loop, which can be left with 'return', 'break' and 'goto' at any * point. * * When the scope is left user_##@_mode##_access_end() is automatically * invoked. * * When the architecture supports masked user access and the access region * which is determined by @uptr and @size is not a valid user space * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user * space address and does not terminate early. This optimizes for the good * case and lets the performance uncritical bad case go through the fault. * * The eventual modification of the pointer is limited to the scope. * Outside of the scope the original pointer value is unmodified, so that * the original pointer value is available for diagnostic purposes in an * out of scope fault path. * * Nesting scoped user access into a user access scope is invalid and fails * the build. Nesting into other guards, e.g. pagefault is safe. * * The masked variant does not check the size of the access and relies on a * mapping hole (e.g. guard page) to catch an out of range pointer, the * first access to user memory inside the scope has to be within * @uptr ... @uptr + PAGE_SIZE - 1 * * Don't use directly. Use scoped_masked_user_$MODE_access() instead. */ #define __scoped_user_access(mode, uptr, size, elbl) \ for (bool done = false; !done; done = true) \ for (auto _tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ !done; done = true) \ /* Force modified pointer usage within the scope */ \ for (const auto uptr __cleanup(__scoped_user_##mode##_access_end) = \ _tmpptr; !done; done = true) /** * scoped_user_read_access_size - Start a scoped user read access with given size * @usrc: Pointer to the user space address to read from * @size: Size of the access starting from @usrc * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. */ #define scoped_user_read_access_size(usrc, size, elbl) \ __scoped_user_access(read, usrc, size, elbl) /** * scoped_user_read_access - Start a scoped user read access * @usrc: Pointer to the user space address to read from * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @usrc is determined via sizeof(*@usrc)). * * For further information see __scoped_user_access() above. */ #define scoped_user_read_access(usrc, elbl) \ scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl) /** * scoped_user_write_access_size - Start a scoped user write access with given size * @udst: Pointer to the user space address to write to * @size: Size of the access starting from @udst * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. */ #define scoped_user_write_access_size(udst, size, elbl) \ __scoped_user_access(write, udst, size, elbl) /** * scoped_user_write_access - Start a scoped user write access * @udst: Pointer to the user space address to write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @udst is determined via sizeof(*@udst)). * * For further information see __scoped_user_access() above. */ #define scoped_user_write_access(udst, elbl) \ scoped_user_write_access_size(udst, sizeof(*(udst)), elbl) /** * scoped_user_rw_access_size - Start a scoped user read/write access with given size * @uptr: Pointer to the user space address to read from and write to * @size: Size of the access starting from @uptr * @elbl: Error label to goto when the access region is rejected * * For further information see __scoped_user_access() above. */ #define scoped_user_rw_access_size(uptr, size, elbl) \ __scoped_user_access(rw, uptr, size, elbl) /** * scoped_user_rw_access - Start a scoped user read/write access * @uptr: Pointer to the user space address to read from and write to * @elbl: Error label to goto when the access region is rejected * * The size of the access starting from @uptr is determined via sizeof(*@uptr)). * * For further information see __scoped_user_access() above. */ #define scoped_user_rw_access(uptr, elbl) \ scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) /** * get_user_inline - Read user data inlined * @val: The variable to store the value read from user memory * @usrc: Pointer to the user space memory to read from * * Return: 0 if successful, -EFAULT when faulted * * Inlined variant of get_user(). Only use when there is a demonstrable * performance reason. */ #define get_user_inline(val, usrc) \ ({ \ __label__ efault; \ typeof(usrc) _tmpsrc = usrc; \ int _ret = 0; \ \ scoped_user_read_access(_tmpsrc, efault) \ unsafe_get_user(val, _tmpsrc, efault); \ if (0) { \ efault: \ _ret = -EFAULT; \ } \ _ret; \ }) /** * put_user_inline - Write to user memory inlined * @val: The value to write * @udst: Pointer to the user space memory to write to * * Return: 0 if successful, -EFAULT when faulted * * Inlined variant of put_user(). Only use when there is a demonstrable * performance reason. */ #define put_user_inline(val, udst) \ ({ \ __label__ efault; \ typeof(udst) _tmpdst = udst; \ int _ret = 0; \ \ scoped_user_write_access(_tmpdst, efault) \ unsafe_put_user(val, _tmpdst, efault); \ if (0) { \ efault: \ _ret = -EFAULT; \ } \ _ret; \ }) #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, unsigned long len); #endif #endif /* __LINUX_UACCESS_H__ */
3 3 2 3 3 2 2 2 2 1 2 2 2 3 3 2 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 // SPDX-License-Identifier: GPL-2.0-or-later /* * E3C EC168 DVB USB driver * * Copyright (C) 2009 Antti Palosaari <crope@iki.fi> */ #include "ec168.h" #include "ec100.h" #include "mxl5005s.h" DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); static int ec168_ctrl_msg(struct dvb_usb_device *d, struct ec168_req *req) { int ret; unsigned int pipe; u8 request, requesttype; u8 *buf; switch (req->cmd) { case DOWNLOAD_FIRMWARE: case GPIO: case WRITE_I2C: case STREAMING_CTRL: requesttype = (USB_TYPE_VENDOR | USB_DIR_OUT); request = req->cmd; break; case READ_I2C: requesttype = (USB_TYPE_VENDOR | USB_DIR_IN); request = req->cmd; break; case GET_CONFIG: requesttype = (USB_TYPE_VENDOR | USB_DIR_IN); request = CONFIG; break; case SET_CONFIG: requesttype = (USB_TYPE_VENDOR | USB_DIR_OUT); request = CONFIG; break; case WRITE_DEMOD: requesttype = (USB_TYPE_VENDOR | USB_DIR_OUT); request = DEMOD_RW; break; case READ_DEMOD: requesttype = (USB_TYPE_VENDOR | USB_DIR_IN); request = DEMOD_RW; break; default: dev_err(&d->udev->dev, "%s: unknown command=%02x\n", KBUILD_MODNAME, req->cmd); ret = -EINVAL; goto error; } buf = kmalloc(req->size, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto error; } if (requesttype == (USB_TYPE_VENDOR | USB_DIR_OUT)) { /* write */ memcpy(buf, req->data, req->size); pipe = usb_sndctrlpipe(d->udev, 0); } else { /* read */ pipe = usb_rcvctrlpipe(d->udev, 0); } msleep(1); /* avoid I2C errors */ ret = usb_control_msg(d->udev, pipe, request, requesttype, req->value, req->index, buf, req->size, EC168_USB_TIMEOUT); dvb_usb_dbg_usb_control_msg(d->udev, request, requesttype, req->value, req->index, buf, req->size); if (ret < 0) goto err_dealloc; else ret = 0; /* read request, copy returned data to return buf */ if (!ret && requesttype == (USB_TYPE_VENDOR | USB_DIR_IN)) memcpy(req->data, buf, req->size); kfree(buf); return ret; err_dealloc: kfree(buf); error: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } /* I2C */ static struct ec100_config ec168_ec100_config; static int ec168_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); struct ec168_req req; int i = 0; int ret; if (num > 2) return -EINVAL; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; while (i < num) { if (num > i + 1 && (msg[i+1].flags & I2C_M_RD)) { if (msg[i].addr == ec168_ec100_config.demod_address) { if (msg[i].len < 1) { i = -EOPNOTSUPP; break; } req.cmd = READ_DEMOD; req.value = 0; req.index = 0xff00 + msg[i].buf[0]; /* reg */ req.size = msg[i+1].len; /* bytes to read */ req.data = &msg[i+1].buf[0]; ret = ec168_ctrl_msg(d, &req); i += 2; } else { dev_err(&d->udev->dev, "%s: I2C read not " \ "implemented\n", KBUILD_MODNAME); ret = -EOPNOTSUPP; i += 2; } } else { if (msg[i].addr == ec168_ec100_config.demod_address) { if (msg[i].len < 1) { i = -EOPNOTSUPP; break; } req.cmd = WRITE_DEMOD; req.value = msg[i].buf[1]; /* val */ req.index = 0xff00 + msg[i].buf[0]; /* reg */ req.size = 0; req.data = NULL; ret = ec168_ctrl_msg(d, &req); i += 1; } else { if (msg[i].len < 1) { i = -EOPNOTSUPP; break; } req.cmd = WRITE_I2C; req.value = msg[i].buf[0]; /* val */ req.index = 0x0100 + msg[i].addr; /* I2C addr */ req.size = msg[i].len-1; req.data = &msg[i].buf[1]; ret = ec168_ctrl_msg(d, &req); i += 1; } } if (ret) goto error; } ret = i; error: mutex_unlock(&d->i2c_mutex); return ret; } static u32 ec168_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C; } static const struct i2c_algorithm ec168_i2c_algo = { .master_xfer = ec168_i2c_xfer, .functionality = ec168_i2c_func, }; /* Callbacks for DVB USB */ static int ec168_identify_state(struct dvb_usb_device *d, const char **name) { int ret; u8 reply; struct ec168_req req = {GET_CONFIG, 0, 1, sizeof(reply), &reply}; dev_dbg(&d->udev->dev, "%s:\n", __func__); ret = ec168_ctrl_msg(d, &req); if (ret) goto error; dev_dbg(&d->udev->dev, "%s: reply=%02x\n", __func__, reply); if (reply == 0x01) ret = WARM; else ret = COLD; return ret; error: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static int ec168_download_firmware(struct dvb_usb_device *d, const struct firmware *fw) { int ret, len, remaining; struct ec168_req req = {DOWNLOAD_FIRMWARE, 0, 0, 0, NULL}; dev_dbg(&d->udev->dev, "%s:\n", __func__); #define LEN_MAX 2048 /* max packet size */ for (remaining = fw->size; remaining > 0; remaining -= LEN_MAX) { len = remaining; if (len > LEN_MAX) len = LEN_MAX; req.size = len; req.data = (u8 *) &fw->data[fw->size - remaining]; req.index = fw->size - remaining; ret = ec168_ctrl_msg(d, &req); if (ret) { dev_err(&d->udev->dev, "%s: firmware download failed=%d\n", KBUILD_MODNAME, ret); goto error; } } req.size = 0; /* set "warm"? */ req.cmd = SET_CONFIG; req.value = 0; req.index = 0x0001; ret = ec168_ctrl_msg(d, &req); if (ret) goto error; /* really needed - no idea what does */ req.cmd = GPIO; req.value = 0; req.index = 0x0206; ret = ec168_ctrl_msg(d, &req); if (ret) goto error; /* activate tuner I2C? */ req.cmd = WRITE_I2C; req.value = 0; req.index = 0x00c6; ret = ec168_ctrl_msg(d, &req); if (ret) goto error; return ret; error: dev_dbg(&d->udev->dev, "%s: failed=%d\n", __func__, ret); return ret; } static struct ec100_config ec168_ec100_config = { .demod_address = 0xff, /* not real address, demod is integrated */ }; static int ec168_ec100_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); dev_dbg(&d->udev->dev, "%s:\n", __func__); adap->fe[0] = dvb_attach(ec100_attach, &ec168_ec100_config, &d->i2c_adap); if (adap->fe[0] == NULL) return -ENODEV; return 0; } static struct mxl5005s_config ec168_mxl5003s_config = { .i2c_address = 0xc6, .if_freq = IF_FREQ_4570000HZ, .xtal_freq = CRYSTAL_FREQ_16000000HZ, .agc_mode = MXL_SINGLE_AGC, .tracking_filter = MXL_TF_OFF, .rssi_enable = MXL_RSSI_ENABLE, .cap_select = MXL_CAP_SEL_ENABLE, .div_out = MXL_DIV_OUT_4, .clock_out = MXL_CLOCK_OUT_DISABLE, .output_load = MXL5005S_IF_OUTPUT_LOAD_200_OHM, .top = MXL5005S_TOP_25P2, .mod_mode = MXL_DIGITAL_MODE, .if_mode = MXL_ZERO_IF, .AgcMasterByte = 0x00, }; static int ec168_mxl5003s_tuner_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap_to_d(adap); dev_dbg(&d->udev->dev, "%s:\n", __func__); return dvb_attach(mxl5005s_attach, adap->fe[0], &d->i2c_adap, &ec168_mxl5003s_config) == NULL ? -ENODEV : 0; } static int ec168_streaming_ctrl(struct dvb_frontend *fe, int onoff) { struct dvb_usb_device *d = fe_to_d(fe); struct ec168_req req = {STREAMING_CTRL, 0x7f01, 0x0202, 0, NULL}; dev_dbg(&d->udev->dev, "%s: onoff=%d\n", __func__, onoff); if (onoff) req.index = 0x0102; return ec168_ctrl_msg(d, &req); } /* DVB USB Driver stuff */ /* bInterfaceNumber 0 is HID * bInterfaceNumber 1 is DVB-T */ static const struct dvb_usb_device_properties ec168_props = { .driver_name = KBUILD_MODNAME, .owner = THIS_MODULE, .adapter_nr = adapter_nr, .bInterfaceNumber = 1, .identify_state = ec168_identify_state, .firmware = EC168_FIRMWARE, .download_firmware = ec168_download_firmware, .i2c_algo = &ec168_i2c_algo, .frontend_attach = ec168_ec100_frontend_attach, .tuner_attach = ec168_mxl5003s_tuner_attach, .streaming_ctrl = ec168_streaming_ctrl, .num_adapters = 1, .adapter = { { .stream = DVB_USB_STREAM_BULK(0x82, 6, 32 * 512), } }, }; static const struct usb_device_id ec168_id[] = { { DVB_USB_DEVICE(USB_VID_E3C, USB_PID_E3C_EC168, &ec168_props, "E3C EC168 reference design", NULL)}, { DVB_USB_DEVICE(USB_VID_E3C, USB_PID_E3C_EC168_2, &ec168_props, "E3C EC168 reference design", NULL)}, { DVB_USB_DEVICE(USB_VID_E3C, USB_PID_E3C_EC168_3, &ec168_props, "E3C EC168 reference design", NULL)}, { DVB_USB_DEVICE(USB_VID_E3C, USB_PID_E3C_EC168_4, &ec168_props, "E3C EC168 reference design", NULL)}, { DVB_USB_DEVICE(USB_VID_E3C, USB_PID_E3C_EC168_5, &ec168_props, "E3C EC168 reference design", NULL)}, {} }; MODULE_DEVICE_TABLE(usb, ec168_id); static struct usb_driver ec168_driver = { .name = KBUILD_MODNAME, .id_table = ec168_id, .probe = dvb_usbv2_probe, .disconnect = dvb_usbv2_disconnect, .suspend = dvb_usbv2_suspend, .resume = dvb_usbv2_resume, .no_dynamic_id = 1, .soft_unbind = 1, }; module_usb_driver(ec168_driver); MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>"); MODULE_DESCRIPTION("E3C EC168 driver"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(EC168_FIRMWARE);
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 // SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for N-Trig touchscreens * * Copyright (c) 2008-2010 Rafi Rubin * Copyright (c) 2009-2010 Stephane Chatty */ /* */ #include <linux/device.h> #include <linux/hid.h> #include <linux/usb.h> #include "usbhid/usbhid.h" #include <linux/module.h> #include <linux/slab.h> #include "hid-ids.h" #define NTRIG_DUPLICATE_USAGES 0x001 static unsigned int min_width; module_param(min_width, uint, 0644); MODULE_PARM_DESC(min_width, "Minimum touch contact width to accept."); static unsigned int min_height; module_param(min_height, uint, 0644); MODULE_PARM_DESC(min_height, "Minimum touch contact height to accept."); static unsigned int activate_slack = 1; module_param(activate_slack, uint, 0644); MODULE_PARM_DESC(activate_slack, "Number of touch frames to ignore at " "the start of touch input."); static unsigned int deactivate_slack = 4; module_param(deactivate_slack, uint, 0644); MODULE_PARM_DESC(deactivate_slack, "Number of empty frames to ignore before " "deactivating touch."); static unsigned int activation_width = 64; module_param(activation_width, uint, 0644); MODULE_PARM_DESC(activation_width, "Width threshold to immediately start " "processing touch events."); static unsigned int activation_height = 32; module_param(activation_height, uint, 0644); MODULE_PARM_DESC(activation_height, "Height threshold to immediately start " "processing touch events."); struct ntrig_data { /* Incoming raw values for a single contact */ __u16 x, y, w, h; __u16 id; bool tipswitch; bool confidence; bool first_contact_touch; bool reading_mt; __u8 mt_footer[4]; __u8 mt_foot_count; /* The current activation state. */ __s8 act_state; /* Empty frames to ignore before recognizing the end of activity */ __s8 deactivate_slack; /* Frames to ignore before acknowledging the start of activity */ __s8 activate_slack; /* Minimum size contact to accept */ __u16 min_width; __u16 min_height; /* Threshold to override activation slack */ __u16 activation_width; __u16 activation_height; __u16 sensor_logical_width; __u16 sensor_logical_height; __u16 sensor_physical_width; __u16 sensor_physical_height; }; /* * This function converts the 4 byte raw firmware code into * a string containing 5 comma separated numbers. */ static int ntrig_version_string(unsigned char *raw, char *buf) { __u8 a = (raw[1] & 0x0e) >> 1; __u8 b = (raw[0] & 0x3c) >> 2; __u8 c = ((raw[0] & 0x03) << 3) | ((raw[3] & 0xe0) >> 5); __u8 d = ((raw[3] & 0x07) << 3) | ((raw[2] & 0xe0) >> 5); __u8 e = raw[2] & 0x07; /* * As yet unmapped bits: * 0b11000000 0b11110001 0b00011000 0b00011000 */ return sprintf(buf, "%u.%u.%u.%u.%u", a, b, c, d, e); } static inline int ntrig_get_mode(struct hid_device *hdev) { struct hid_report *report = hdev->report_enum[HID_FEATURE_REPORT]. report_id_hash[0x0d]; if (!report || report->maxfield < 1 || report->field[0]->report_count < 1) return -EINVAL; hid_hw_request(hdev, report, HID_REQ_GET_REPORT); hid_hw_wait(hdev); return (int)report->field[0]->value[0]; } static inline void ntrig_set_mode(struct hid_device *hdev, const int mode) { struct hid_report *report; __u8 mode_commands[4] = { 0xe, 0xf, 0x1b, 0x10 }; if (mode < 0 || mode > 3) return; report = hdev->report_enum[HID_FEATURE_REPORT]. report_id_hash[mode_commands[mode]]; if (!report) return; hid_hw_request(hdev, report, HID_REQ_GET_REPORT); } static void ntrig_report_version(struct hid_device *hdev) { int ret; char buf[20]; struct usb_device *usb_dev = hid_to_usb_dev(hdev); unsigned char *data __free(kfree) = kmalloc(8, GFP_KERNEL); if (!hid_is_usb(hdev)) return; if (!data) return; ret = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0), USB_REQ_CLEAR_FEATURE, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN, 0x30c, 1, data, 8, USB_CTRL_SET_TIMEOUT); if (ret == 8) { ret = ntrig_version_string(&data[2], buf); hid_info(hdev, "Firmware version: %s (%02x%02x %02x%02x)\n", buf, data[2], data[3], data[4], data[5]); } } static ssize_t show_phys_width(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->sensor_physical_width); } static DEVICE_ATTR(sensor_physical_width, S_IRUGO, show_phys_width, NULL); static ssize_t show_phys_height(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->sensor_physical_height); } static DEVICE_ATTR(sensor_physical_height, S_IRUGO, show_phys_height, NULL); static ssize_t show_log_width(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->sensor_logical_width); } static DEVICE_ATTR(sensor_logical_width, S_IRUGO, show_log_width, NULL); static ssize_t show_log_height(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->sensor_logical_height); } static DEVICE_ATTR(sensor_logical_height, S_IRUGO, show_log_height, NULL); static ssize_t show_min_width(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->min_width * nd->sensor_physical_width / nd->sensor_logical_width); } static ssize_t set_min_width(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); unsigned long val; if (kstrtoul(buf, 0, &val)) return -EINVAL; if (val > nd->sensor_physical_width) return -EINVAL; nd->min_width = val * nd->sensor_logical_width / nd->sensor_physical_width; return count; } static DEVICE_ATTR(min_width, S_IWUSR | S_IRUGO, show_min_width, set_min_width); static ssize_t show_min_height(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->min_height * nd->sensor_physical_height / nd->sensor_logical_height); } static ssize_t set_min_height(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); unsigned long val; if (kstrtoul(buf, 0, &val)) return -EINVAL; if (val > nd->sensor_physical_height) return -EINVAL; nd->min_height = val * nd->sensor_logical_height / nd->sensor_physical_height; return count; } static DEVICE_ATTR(min_height, S_IWUSR | S_IRUGO, show_min_height, set_min_height); static ssize_t show_activate_slack(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->activate_slack); } static ssize_t set_activate_slack(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); unsigned long val; if (kstrtoul(buf, 0, &val)) return -EINVAL; if (val > 0x7f) return -EINVAL; nd->activate_slack = val; return count; } static DEVICE_ATTR(activate_slack, S_IWUSR | S_IRUGO, show_activate_slack, set_activate_slack); static ssize_t show_activation_width(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->activation_width * nd->sensor_physical_width / nd->sensor_logical_width); } static ssize_t set_activation_width(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); unsigned long val; if (kstrtoul(buf, 0, &val)) return -EINVAL; if (val > nd->sensor_physical_width) return -EINVAL; nd->activation_width = val * nd->sensor_logical_width / nd->sensor_physical_width; return count; } static DEVICE_ATTR(activation_width, S_IWUSR | S_IRUGO, show_activation_width, set_activation_width); static ssize_t show_activation_height(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", nd->activation_height * nd->sensor_physical_height / nd->sensor_logical_height); } static ssize_t set_activation_height(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); unsigned long val; if (kstrtoul(buf, 0, &val)) return -EINVAL; if (val > nd->sensor_physical_height) return -EINVAL; nd->activation_height = val * nd->sensor_logical_height / nd->sensor_physical_height; return count; } static DEVICE_ATTR(activation_height, S_IWUSR | S_IRUGO, show_activation_height, set_activation_height); static ssize_t show_deactivate_slack(struct device *dev, struct device_attribute *attr, char *buf) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); return sprintf(buf, "%d\n", -nd->deactivate_slack); } static ssize_t set_deactivate_slack(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hid_device *hdev = to_hid_device(dev); struct ntrig_data *nd = hid_get_drvdata(hdev); unsigned long val; if (kstrtoul(buf, 0, &val)) return -EINVAL; /* * No more than 8 terminal frames have been observed so far * and higher slack is highly likely to leave the single * touch emulation stuck down. */ if (val > 7) return -EINVAL; nd->deactivate_slack = -val; return count; } static DEVICE_ATTR(deactivate_slack, S_IWUSR | S_IRUGO, show_deactivate_slack, set_deactivate_slack); static struct attribute *sysfs_attrs[] = { &dev_attr_sensor_physical_width.attr, &dev_attr_sensor_physical_height.attr, &dev_attr_sensor_logical_width.attr, &dev_attr_sensor_logical_height.attr, &dev_attr_min_height.attr, &dev_attr_min_width.attr, &dev_attr_activate_slack.attr, &dev_attr_activation_width.attr, &dev_attr_activation_height.attr, &dev_attr_deactivate_slack.attr, NULL }; static const struct attribute_group ntrig_attribute_group = { .attrs = sysfs_attrs }; /* * this driver is aimed at two firmware versions in circulation: * - dual pen/finger single touch * - finger multitouch, pen not working */ static int ntrig_input_mapping(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { struct ntrig_data *nd = hid_get_drvdata(hdev); /* No special mappings needed for the pen and single touch */ if (field->physical) return 0; switch (usage->hid & HID_USAGE_PAGE) { case HID_UP_GENDESK: switch (usage->hid) { case HID_GD_X: hid_map_usage(hi, usage, bit, max, EV_ABS, ABS_MT_POSITION_X); input_set_abs_params(hi->input, ABS_X, field->logical_minimum, field->logical_maximum, 0, 0); if (!nd->sensor_logical_width) { nd->sensor_logical_width = field->logical_maximum - field->logical_minimum; nd->sensor_physical_width = field->physical_maximum - field->physical_minimum; nd->activation_width = activation_width * nd->sensor_logical_width / nd->sensor_physical_width; nd->min_width = min_width * nd->sensor_logical_width / nd->sensor_physical_width; } return 1; case HID_GD_Y: hid_map_usage(hi, usage, bit, max, EV_ABS, ABS_MT_POSITION_Y); input_set_abs_params(hi->input, ABS_Y, field->logical_minimum, field->logical_maximum, 0, 0); if (!nd->sensor_logical_height) { nd->sensor_logical_height = field->logical_maximum - field->logical_minimum; nd->sensor_physical_height = field->physical_maximum - field->physical_minimum; nd->activation_height = activation_height * nd->sensor_logical_height / nd->sensor_physical_height; nd->min_height = min_height * nd->sensor_logical_height / nd->sensor_physical_height; } return 1; } return 0; case HID_UP_DIGITIZER: switch (usage->hid) { /* we do not want to map these for now */ case HID_DG_CONTACTID: /* Not trustworthy, squelch for now */ case HID_DG_INPUTMODE: case HID_DG_DEVICEINDEX: case HID_DG_CONTACTMAX: return -1; /* width/height mapped on TouchMajor/TouchMinor/Orientation */ case HID_DG_WIDTH: hid_map_usage(hi, usage, bit, max, EV_ABS, ABS_MT_TOUCH_MAJOR); return 1; case HID_DG_HEIGHT: hid_map_usage(hi, usage, bit, max, EV_ABS, ABS_MT_TOUCH_MINOR); input_set_abs_params(hi->input, ABS_MT_ORIENTATION, 0, 1, 0, 0); return 1; } return 0; case 0xff000000: /* we do not want to map these: no input-oriented meaning */ return -1; } return 0; } static int ntrig_input_mapped(struct hid_device *hdev, struct hid_input *hi, struct hid_field *field, struct hid_usage *usage, unsigned long **bit, int *max) { /* No special mappings needed for the pen and single touch */ if (field->physical) return 0; if (usage->type == EV_KEY || usage->type == EV_REL || usage->type == EV_ABS) clear_bit(usage->code, *bit); return 0; } /* * this function is called upon all reports * so that we can filter contact point information, * decide whether we are in multi or single touch mode * and call input_mt_sync after each point if necessary */ static int ntrig_event (struct hid_device *hid, struct hid_field *field, struct hid_usage *usage, __s32 value) { struct ntrig_data *nd = hid_get_drvdata(hid); struct input_dev *input; /* Skip processing if not a claimed input */ if (!(hid->claimed & HID_CLAIMED_INPUT)) goto not_claimed_input; /* This function is being called before the structures are fully * initialized */ if(!(field->hidinput && field->hidinput->input)) return -EINVAL; input = field->hidinput->input; /* No special handling needed for the pen */ if (field->application == HID_DG_PEN) return 0; switch (usage->hid) { case 0xff000001: /* Tag indicating the start of a multitouch group */ nd->reading_mt = true; nd->first_contact_touch = false; break; case HID_DG_TIPSWITCH: nd->tipswitch = value; /* Prevent emission of touch until validated */ return 1; case HID_DG_CONFIDENCE: nd->confidence = value; break; case HID_GD_X: nd->x = value; /* Clear the contact footer */ nd->mt_foot_count = 0; break; case HID_GD_Y: nd->y = value; break; case HID_DG_CONTACTID: nd->id = value; break; case HID_DG_WIDTH: nd->w = value; break; case HID_DG_HEIGHT: nd->h = value; /* * when in single touch mode, this is the last * report received in a finger event. We want * to emit a normal (X, Y) position */ if (!nd->reading_mt) { /* * TipSwitch indicates the presence of a * finger in single touch mode. */ input_report_key(input, BTN_TOUCH, nd->tipswitch); input_report_key(input, BTN_TOOL_DOUBLETAP, nd->tipswitch); input_event(input, EV_ABS, ABS_X, nd->x); input_event(input, EV_ABS, ABS_Y, nd->y); } break; case 0xff000002: /* * we receive this when the device is in multitouch * mode. The first of the three values tagged with * this usage tells if the contact point is real * or a placeholder */ /* Shouldn't get more than 4 footer packets, so skip */ if (nd->mt_foot_count >= 4) break; nd->mt_footer[nd->mt_foot_count++] = value; /* if the footer isn't complete break */ if (nd->mt_foot_count != 4) break; /* Pen activity signal. */ if (nd->mt_footer[2]) { /* * When the pen deactivates touch, we see a * bogus frame with ContactCount > 0. * We can * save a bit of work by ensuring act_state < 0 * even if deactivation slack is turned off. */ nd->act_state = deactivate_slack - 1; nd->confidence = false; break; } /* * The first footer value indicates the presence of a * finger. */ if (nd->mt_footer[0]) { /* * We do not want to process contacts under * the size threshold, but do not want to * ignore them for activation state */ if (nd->w < nd->min_width || nd->h < nd->min_height) nd->confidence = false; } else break; if (nd->act_state > 0) { /* * Contact meets the activation size threshold */ if (nd->w >= nd->activation_width && nd->h >= nd->activation_height) { if (nd->id) /* * first contact, activate now */ nd->act_state = 0; else { /* * avoid corrupting this frame * but ensure next frame will * be active */ nd->act_state = 1; break; } } else /* * Defer adjusting the activation state * until the end of the frame. */ break; } /* Discarding this contact */ if (!nd->confidence) break; /* emit a normal (X, Y) for the first point only */ if (nd->id == 0) { /* * TipSwitch is superfluous in multitouch * mode. The footer events tell us * if there is a finger on the screen or * not. */ nd->first_contact_touch = nd->confidence; input_event(input, EV_ABS, ABS_X, nd->x); input_event(input, EV_ABS, ABS_Y, nd->y); } /* Emit MT events */ input_event(input, EV_ABS, ABS_MT_POSITION_X, nd->x); input_event(input, EV_ABS, ABS_MT_POSITION_Y, nd->y); /* * Translate from height and width to size * and orientation. */ if (nd->w > nd->h) { input_event(input, EV_ABS, ABS_MT_ORIENTATION, 1); input_event(input, EV_ABS, ABS_MT_TOUCH_MAJOR, nd->w); input_event(input, EV_ABS, ABS_MT_TOUCH_MINOR, nd->h); } else { input_event(input, EV_ABS, ABS_MT_ORIENTATION, 0); input_event(input, EV_ABS, ABS_MT_TOUCH_MAJOR, nd->h); input_event(input, EV_ABS, ABS_MT_TOUCH_MINOR, nd->w); } input_mt_sync(field->hidinput->input); break; case HID_DG_CONTACTCOUNT: /* End of a multitouch group */ if (!nd->reading_mt) /* Just to be sure */ break; nd->reading_mt = false; /* * Activation state machine logic: * * Fundamental states: * state > 0: Inactive * state <= 0: Active * state < -deactivate_slack: * Pen termination of touch * * Specific values of interest * state == activate_slack * no valid input since the last reset * * state == 0 * general operational state * * state == -deactivate_slack * read sufficient empty frames to accept * the end of input and reset */ if (nd->act_state > 0) { /* Currently inactive */ if (value) /* * Consider each live contact as * evidence of intentional activity. */ nd->act_state = (nd->act_state > value) ? nd->act_state - value : 0; else /* * Empty frame before we hit the * activity threshold, reset. */ nd->act_state = nd->activate_slack; /* * Entered this block inactive and no * coordinates sent this frame, so hold off * on button state. */ break; } else { /* Currently active */ if (value && nd->act_state >= nd->deactivate_slack) /* * Live point: clear accumulated * deactivation count. */ nd->act_state = 0; else if (nd->act_state <= nd->deactivate_slack) /* * We've consumed the deactivation * slack, time to deactivate and reset. */ nd->act_state = nd->activate_slack; else { /* Move towards deactivation */ nd->act_state--; break; } } if (nd->first_contact_touch && nd->act_state <= 0) { /* * Check to see if we're ready to start * emitting touch events. * * Note: activation slack will decrease over * the course of the frame, and it will be * inconsistent from the start to the end of * the frame. However if the frame starts * with slack, first_contact_touch will still * be 0 and we will not get to this point. */ input_report_key(input, BTN_TOOL_DOUBLETAP, 1); input_report_key(input, BTN_TOUCH, 1); } else { input_report_key(input, BTN_TOOL_DOUBLETAP, 0); input_report_key(input, BTN_TOUCH, 0); } break; default: /* fall-back to the generic hidinput handling */ return 0; } not_claimed_input: /* we have handled the hidinput part, now remains hiddev */ if ((hid->claimed & HID_CLAIMED_HIDDEV) && hid->hiddev_hid_event) hid->hiddev_hid_event(hid, field, usage, value); return 1; } static int ntrig_input_configured(struct hid_device *hid, struct hid_input *hidinput) { struct input_dev *input = hidinput->input; if (hidinput->report->maxfield < 1) return 0; switch (hidinput->report->field[0]->application) { case HID_DG_PEN: input->name = "N-Trig Pen"; break; case HID_DG_TOUCHSCREEN: /* These keys are redundant for fingers, clear them * to prevent incorrect identification */ __clear_bit(BTN_TOOL_PEN, input->keybit); __clear_bit(BTN_TOOL_FINGER, input->keybit); __clear_bit(BTN_0, input->keybit); __set_bit(BTN_TOOL_DOUBLETAP, input->keybit); /* * The physical touchscreen (single touch) * input has a value for physical, whereas * the multitouch only has logical input * fields. */ input->name = (hidinput->report->field[0]->physical) ? "N-Trig Touchscreen" : "N-Trig MultiTouch"; break; } return 0; } static int ntrig_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; struct ntrig_data *nd; struct hid_report *report; if (id->driver_data) hdev->quirks |= HID_QUIRK_MULTI_INPUT | HID_QUIRK_NO_INIT_REPORTS; nd = kmalloc_obj(struct ntrig_data); if (!nd) { hid_err(hdev, "cannot allocate N-Trig data\n"); return -ENOMEM; } nd->reading_mt = false; nd->min_width = 0; nd->min_height = 0; nd->activate_slack = activate_slack; nd->act_state = activate_slack; nd->deactivate_slack = -deactivate_slack; nd->sensor_logical_width = 1; nd->sensor_logical_height = 1; nd->sensor_physical_width = 1; nd->sensor_physical_height = 1; hid_set_drvdata(hdev, nd); ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err_free; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (ret) { hid_err(hdev, "hw start failed\n"); goto err_free; } /* This is needed for devices with more recent firmware versions */ report = hdev->report_enum[HID_FEATURE_REPORT].report_id_hash[0x0a]; if (report) { /* Let the device settle to ensure the wakeup message gets * through */ hid_hw_wait(hdev); hid_hw_request(hdev, report, HID_REQ_GET_REPORT); /* * Sanity check: if the current mode is invalid reset it to * something reasonable. */ if (ntrig_get_mode(hdev) >= 4) ntrig_set_mode(hdev, 3); } ntrig_report_version(hdev); ret = sysfs_create_group(&hdev->dev.kobj, &ntrig_attribute_group); if (ret) hid_err(hdev, "cannot create sysfs group\n"); return 0; err_free: kfree(nd); return ret; } static void ntrig_remove(struct hid_device *hdev) { sysfs_remove_group(&hdev->dev.kobj, &ntrig_attribute_group); hid_hw_stop(hdev); kfree(hid_get_drvdata(hdev)); } static const struct hid_device_id ntrig_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_1), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_2), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_3), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_4), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_5), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_6), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_7), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_8), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_9), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_10), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_11), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_12), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_13), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_14), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_15), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_16), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_17), .driver_data = NTRIG_DUPLICATE_USAGES }, { HID_USB_DEVICE(USB_VENDOR_ID_NTRIG, USB_DEVICE_ID_NTRIG_TOUCH_SCREEN_18), .driver_data = NTRIG_DUPLICATE_USAGES }, { } }; MODULE_DEVICE_TABLE(hid, ntrig_devices); static const struct hid_usage_id ntrig_grabbed_usages[] = { { HID_ANY_ID, HID_ANY_ID, HID_ANY_ID }, { HID_ANY_ID - 1, HID_ANY_ID - 1, HID_ANY_ID - 1 } }; static struct hid_driver ntrig_driver = { .name = "ntrig", .id_table = ntrig_devices, .probe = ntrig_probe, .remove = ntrig_remove, .input_mapping = ntrig_input_mapping, .input_mapped = ntrig_input_mapped, .input_configured = ntrig_input_configured, .usage_table = ntrig_grabbed_usages, .event = ntrig_event, }; module_hid_driver(ntrig_driver); MODULE_DESCRIPTION("HID driver for N-Trig touchscreens"); MODULE_LICENSE("GPL");
541 436 105 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 // SPDX-License-Identifier: GPL-2.0-only /* * Generic HDLC support routines for Linux * * Copyright (C) 1999 - 2008 Krzysztof Halasa <khc@pm.waw.pl> * * Currently supported: * * raw IP-in-HDLC * * Cisco HDLC * * Frame Relay with ANSI or CCITT LMI (both user and network side) * * PPP * * X.25 * * Use sethdlc utility to set line parameters, protocol and PVCs * * How does it work: * - proto->open(), close(), start(), stop() calls are serialized. * The order is: open, [ start, stop ... ] close ... * - proto->start() and stop() are called with spin_lock_irq held. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/hdlc.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/pkt_sched.h> #include <linux/poll.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/net_namespace.h> static const char *version = "HDLC support module revision 1.22"; #undef DEBUG_LINK static struct hdlc_proto *first_proto; static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p, struct net_device *orig_dev) { struct hdlc_device *hdlc; /* First make sure "dev" is an HDLC device */ if (!(dev->priv_flags & IFF_WAN_HDLC)) { kfree_skb(skb); return NET_RX_SUCCESS; } hdlc = dev_to_hdlc(dev); if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(skb); return 0; } BUG_ON(!hdlc->proto->netif_rx); return hdlc->proto->netif_rx(skb); } netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->xmit) return hdlc->proto->xmit(skb, dev); return hdlc->xmit(skb, dev); /* call hardware driver directly */ } EXPORT_SYMBOL(hdlc_start_xmit); static inline void hdlc_proto_start(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->start) hdlc->proto->start(dev); } static inline void hdlc_proto_stop(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->stop) hdlc->proto->stop(dev); } static int hdlc_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); hdlc_device *hdlc; unsigned long flags; int on; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (!(dev->priv_flags & IFF_WAN_HDLC)) return NOTIFY_DONE; /* not an HDLC device */ if (event != NETDEV_CHANGE) return NOTIFY_DONE; /* Only interested in carrier changes */ on = netif_carrier_ok(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_device_event NETDEV_CHANGE, carrier %i\n", dev->name, on); #endif hdlc = dev_to_hdlc(dev); spin_lock_irqsave(&hdlc->state_lock, flags); if (hdlc->carrier == on) goto carrier_exit; /* no change in DCD line level */ hdlc->carrier = on; if (!hdlc->open) goto carrier_exit; if (hdlc->carrier) { netdev_info(dev, "Carrier detected\n"); hdlc_proto_start(dev); } else { netdev_info(dev, "Carrier lost\n"); hdlc_proto_stop(dev); } carrier_exit: spin_unlock_irqrestore(&hdlc->state_lock, flags); return NOTIFY_DONE; } /* Must be called by hardware driver when HDLC device is being opened */ int hdlc_open(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_open() carrier %i open %i\n", dev->name, hdlc->carrier, hdlc->open); #endif if (!hdlc->proto) return -ENOSYS; /* no protocol attached */ if (hdlc->proto->open) { int result = hdlc->proto->open(dev); if (result) return result; } spin_lock_irq(&hdlc->state_lock); if (hdlc->carrier) { netdev_info(dev, "Carrier detected\n"); hdlc_proto_start(dev); } else { netdev_info(dev, "No carrier\n"); } hdlc->open = 1; spin_unlock_irq(&hdlc->state_lock); return 0; } EXPORT_SYMBOL(hdlc_open); /* Must be called by hardware driver when HDLC device is being closed */ void hdlc_close(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_close() carrier %i open %i\n", dev->name, hdlc->carrier, hdlc->open); #endif spin_lock_irq(&hdlc->state_lock); hdlc->open = 0; if (hdlc->carrier) hdlc_proto_stop(dev); spin_unlock_irq(&hdlc->state_lock); if (hdlc->proto->close) hdlc->proto->close(dev); } EXPORT_SYMBOL(hdlc_close); int hdlc_ioctl(struct net_device *dev, struct if_settings *ifs) { struct hdlc_proto *proto = first_proto; int result; if (dev_to_hdlc(dev)->proto) { result = dev_to_hdlc(dev)->proto->ioctl(dev, ifs); if (result != -EINVAL) return result; } /* Not handled by currently attached protocol (if any) */ while (proto) { result = proto->ioctl(dev, ifs); if (result != -EINVAL) return result; proto = proto->next; } return -EINVAL; } EXPORT_SYMBOL(hdlc_ioctl); static const struct header_ops hdlc_null_ops; static void hdlc_setup_dev(struct net_device *dev) { /* Re-init all variables changed by HDLC protocol drivers, * including ether_setup() called from hdlc_raw_eth.c. */ dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->priv_flags = IFF_WAN_HDLC; dev->mtu = HDLC_MAX_MTU; dev->min_mtu = 68; dev->max_mtu = HDLC_MAX_MTU; dev->type = ARPHRD_RAWHDLC; dev->hard_header_len = 0; dev->needed_headroom = 0; dev->addr_len = 0; dev->header_ops = &hdlc_null_ops; } static void hdlc_setup(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); hdlc_setup_dev(dev); hdlc->carrier = 1; hdlc->open = 0; spin_lock_init(&hdlc->state_lock); } struct net_device *alloc_hdlcdev(void *priv) { struct net_device *dev; dev = alloc_netdev(sizeof(struct hdlc_device), "hdlc%d", NET_NAME_UNKNOWN, hdlc_setup); if (dev) dev_to_hdlc(dev)->priv = priv; return dev; } EXPORT_SYMBOL(alloc_hdlcdev); void unregister_hdlc_device(struct net_device *dev) { rtnl_lock(); detach_hdlc_protocol(dev); unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(unregister_hdlc_device); int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size) { int err; err = detach_hdlc_protocol(dev); if (err) return err; if (!try_module_get(proto->module)) return -ENOSYS; if (size) { dev_to_hdlc(dev)->state = kmalloc(size, GFP_KERNEL); if (!dev_to_hdlc(dev)->state) { module_put(proto->module); return -ENOBUFS; } } dev_to_hdlc(dev)->proto = proto; return 0; } EXPORT_SYMBOL(attach_hdlc_protocol); int detach_hdlc_protocol(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); int err; if (hdlc->proto) { err = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev); err = notifier_to_errno(err); if (err) { netdev_err(dev, "Refused to change device type\n"); return err; } if (hdlc->proto->detach) hdlc->proto->detach(dev); module_put(hdlc->proto->module); hdlc->proto = NULL; } kfree(hdlc->state); hdlc->state = NULL; hdlc_setup_dev(dev); return 0; } EXPORT_SYMBOL(detach_hdlc_protocol); void register_hdlc_protocol(struct hdlc_proto *proto) { rtnl_lock(); proto->next = first_proto; first_proto = proto; rtnl_unlock(); } EXPORT_SYMBOL(register_hdlc_protocol); void unregister_hdlc_protocol(struct hdlc_proto *proto) { struct hdlc_proto **p; rtnl_lock(); p = &first_proto; while (*p != proto) { BUG_ON(!*p); p = &((*p)->next); } *p = proto->next; rtnl_unlock(); } EXPORT_SYMBOL(unregister_hdlc_protocol); MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>"); MODULE_DESCRIPTION("HDLC support module"); MODULE_LICENSE("GPL v2"); static struct packet_type hdlc_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_HDLC), .func = hdlc_rcv, }; static struct notifier_block hdlc_notifier = { .notifier_call = hdlc_device_event, }; static int __init hdlc_module_init(void) { int result; pr_info("%s\n", version); result = register_netdevice_notifier(&hdlc_notifier); if (result) return result; dev_add_pack(&hdlc_packet_type); return 0; } static void __exit hdlc_module_exit(void) { dev_remove_pack(&hdlc_packet_type); unregister_netdevice_notifier(&hdlc_notifier); } module_init(hdlc_module_init); module_exit(hdlc_module_exit);
23 12 13 23 5 5 5 46 46 46 10 10 53 54 54 3 3 2 2 42 42 42 44 44 44 8 8 8 8 5 4 1 5 26 14 17 4 1 1 14 13 1641 1638 1644 44 1638 8 44 44 44 8 45 45 45 44 44 44 44 44 44 8 8 8 8 8 45 44 1 8 8 52 8 44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 // SPDX-License-Identifier: GPL-2.0 /* * udc.c - Core UDC Framework * * Copyright (C) 2010 Texas Instruments * Author: Felipe Balbi <balbi@ti.com> */ #define pr_fmt(fmt) "UDC core: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <linux/list.h> #include <linux/idr.h> #include <linux/err.h> #include <linux/dma-mapping.h> #include <linux/sched/task_stack.h> #include <linux/workqueue.h> #include <linux/usb/ch9.h> #include <linux/usb/gadget.h> #include <linux/usb.h> #include "trace.h" static DEFINE_IDA(gadget_id_numbers); static const struct bus_type gadget_bus_type; /** * struct usb_udc - describes one usb device controller * @driver: the gadget driver pointer. For use by the class code * @dev: the child device to the actual controller * @gadget: the gadget. For use by the class code * @list: for use by the udc class driver * @vbus: for udcs who care about vbus status, this value is real vbus status; * for udcs who do not care about vbus status, this value is always true * @started: the UDC's started state. True if the UDC had started. * @allow_connect: Indicates whether UDC is allowed to be pulled up. * Set/cleared by gadget_(un)bind_driver() after gadget driver is bound or * unbound. * @vbus_work: work routine to handle VBUS status change notifications. * @connect_lock: protects udc->started, gadget->connect, * gadget->allow_connect and gadget->deactivate. The routines * usb_gadget_connect_locked(), usb_gadget_disconnect_locked(), * usb_udc_connect_control_locked(), usb_gadget_udc_start_locked() and * usb_gadget_udc_stop_locked() are called with this lock held. * * This represents the internal data structure which is used by the UDC-class * to hold information about udc driver and gadget together. */ struct usb_udc { struct usb_gadget_driver *driver; struct usb_gadget *gadget; struct device dev; struct list_head list; bool vbus; bool started; bool allow_connect; struct work_struct vbus_work; struct mutex connect_lock; }; static const struct class udc_class; static LIST_HEAD(udc_list); /* Protects udc_list, udc->driver, driver->is_bound, and related calls */ static DEFINE_MUTEX(udc_lock); /* ------------------------------------------------------------------------- */ /** * usb_ep_set_maxpacket_limit - set maximum packet size limit for endpoint * @ep:the endpoint being configured * @maxpacket_limit:value of maximum packet size limit * * This function should be used only in UDC drivers to initialize endpoint * (usually in probe function). */ void usb_ep_set_maxpacket_limit(struct usb_ep *ep, unsigned maxpacket_limit) { ep->maxpacket_limit = maxpacket_limit; ep->maxpacket = maxpacket_limit; trace_usb_ep_set_maxpacket_limit(ep, 0); } EXPORT_SYMBOL_GPL(usb_ep_set_maxpacket_limit); /** * usb_ep_enable - configure endpoint, making it usable * @ep:the endpoint being configured. may not be the endpoint named "ep0". * drivers discover endpoints through the ep_list of a usb_gadget. * * When configurations are set, or when interface settings change, the driver * will enable or disable the relevant endpoints. while it is enabled, an * endpoint may be used for i/o until the driver receives a disconnect() from * the host or until the endpoint is disabled. * * the ep0 implementation (which calls this routine) must ensure that the * hardware capabilities of each endpoint match the descriptor provided * for it. for example, an endpoint named "ep2in-bulk" would be usable * for interrupt transfers as well as bulk, but it likely couldn't be used * for iso transfers or for endpoint 14. some endpoints are fully * configurable, with more generic names like "ep-a". (remember that for * USB, "in" means "towards the USB host".) * * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. */ int usb_ep_enable(struct usb_ep *ep) { int ret = 0; if (ep->enabled) goto out; /* UDC drivers can't handle endpoints with maxpacket size 0 */ if (!ep->desc || usb_endpoint_maxp(ep->desc) == 0) { WARN_ONCE(1, "%s: ep%d (%s) has %s\n", __func__, ep->address, ep->name, (!ep->desc) ? "NULL descriptor" : "maxpacket 0"); ret = -EINVAL; goto out; } ret = ep->ops->enable(ep, ep->desc); if (ret) goto out; ep->enabled = true; out: trace_usb_ep_enable(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_enable); /** * usb_ep_disable - endpoint is no longer usable * @ep:the endpoint being unconfigured. may not be the endpoint named "ep0". * * no other task may be using this endpoint when this is called. * any pending and uncompleted requests will complete with status * indicating disconnect (-ESHUTDOWN) before this call returns. * gadget drivers must call usb_ep_enable() again before queueing * requests to the endpoint. * * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. */ int usb_ep_disable(struct usb_ep *ep) { int ret = 0; if (!ep->enabled) goto out; ret = ep->ops->disable(ep); if (ret) goto out; ep->enabled = false; out: trace_usb_ep_disable(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_disable); /** * usb_ep_alloc_request - allocate a request object to use with this endpoint * @ep:the endpoint to be used with with the request * @gfp_flags:GFP_* flags to use * * Request objects must be allocated with this call, since they normally * need controller-specific setup and may even need endpoint-specific * resources such as allocation of DMA descriptors. * Requests may be submitted with usb_ep_queue(), and receive a single * completion callback. Free requests with usb_ep_free_request(), when * they are no longer needed. * * Returns the request, or null if one could not be allocated. */ struct usb_request *usb_ep_alloc_request(struct usb_ep *ep, gfp_t gfp_flags) { struct usb_request *req = NULL; req = ep->ops->alloc_request(ep, gfp_flags); if (req) req->ep = ep; trace_usb_ep_alloc_request(ep, req, req ? 0 : -ENOMEM); return req; } EXPORT_SYMBOL_GPL(usb_ep_alloc_request); /** * usb_ep_free_request - frees a request object * @ep:the endpoint associated with the request * @req:the request being freed * * Reverses the effect of usb_ep_alloc_request(). * Caller guarantees the request is not queued, and that it will * no longer be requeued (or otherwise used). */ void usb_ep_free_request(struct usb_ep *ep, struct usb_request *req) { trace_usb_ep_free_request(ep, req, 0); ep->ops->free_request(ep, req); } EXPORT_SYMBOL_GPL(usb_ep_free_request); /** * usb_ep_queue - queues (submits) an I/O request to an endpoint. * @ep:the endpoint associated with the request * @req:the request being submitted * @gfp_flags: GFP_* flags to use in case the lower level driver couldn't * pre-allocate all necessary memory with the request. * * This tells the device controller to perform the specified request through * that endpoint (reading or writing a buffer). When the request completes, * including being canceled by usb_ep_dequeue(), the request's completion * routine is called to return the request to the driver. Any endpoint * (except control endpoints like ep0) may have more than one transfer * request queued; they complete in FIFO order. Once a gadget driver * submits a request, that request may not be examined or modified until it * is given back to that driver through the completion callback. * * Each request is turned into one or more packets. The controller driver * never merges adjacent requests into the same packet. OUT transfers * will sometimes use data that's already buffered in the hardware. * Drivers can rely on the fact that the first byte of the request's buffer * always corresponds to the first byte of some USB packet, for both * IN and OUT transfers. * * Bulk endpoints can queue any amount of data; the transfer is packetized * automatically. The last packet will be short if the request doesn't fill it * out completely. Zero length packets (ZLPs) should be avoided in portable * protocols since not all usb hardware can successfully handle zero length * packets. (ZLPs may be explicitly written, and may be implicitly written if * the request 'zero' flag is set.) Bulk endpoints may also be used * for interrupt transfers; but the reverse is not true, and some endpoints * won't support every interrupt transfer. (Such as 768 byte packets.) * * Interrupt-only endpoints are less functional than bulk endpoints, for * example by not supporting queueing or not handling buffers that are * larger than the endpoint's maxpacket size. They may also treat data * toggle differently. * * Control endpoints ... after getting a setup() callback, the driver queues * one response (even if it would be zero length). That enables the * status ack, after transferring data as specified in the response. Setup * functions may return negative error codes to generate protocol stalls. * (Note that some USB device controllers disallow protocol stall responses * in some cases.) When control responses are deferred (the response is * written after the setup callback returns), then usb_ep_set_halt() may be * used on ep0 to trigger protocol stalls. Depending on the controller, * it may not be possible to trigger a status-stage protocol stall when the * data stage is over, that is, from within the response's completion * routine. * * For periodic endpoints, like interrupt or isochronous ones, the usb host * arranges to poll once per interval, and the gadget driver usually will * have queued some data to transfer at that time. * * Note that @req's ->complete() callback must never be called from * within usb_ep_queue() as that can create deadlock situations. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. Endpoints that are not enabled * report errors; errors will also be * reported when the usb peripheral is disconnected. * * If and only if @req is successfully queued (the return value is zero), * @req->complete() will be called exactly once, when the Gadget core and * UDC are finished with the request. When the completion function is called, * control of the request is returned to the device driver which submitted it. * The completion handler may then immediately free or reuse @req. */ int usb_ep_queue(struct usb_ep *ep, struct usb_request *req, gfp_t gfp_flags) { int ret = 0; if (!ep->enabled && ep->address) { pr_debug("USB gadget: queue request to disabled ep 0x%x (%s)\n", ep->address, ep->name); ret = -ESHUTDOWN; goto out; } ret = ep->ops->queue(ep, req, gfp_flags); out: trace_usb_ep_queue(ep, req, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_queue); /** * usb_ep_dequeue - dequeues (cancels, unlinks) an I/O request from an endpoint * @ep:the endpoint associated with the request * @req:the request being canceled * * If the request is still active on the endpoint, it is dequeued and * eventually its completion routine is called (with status -ECONNRESET); * else a negative error code is returned. This routine is asynchronous, * that is, it may return before the completion routine runs. * * Note that some hardware can't clear out write fifos (to unlink the request * at the head of the queue) except as part of disconnecting from usb. Such * restrictions prevent drivers from supporting configuration changes, * even to configuration zero (a "chapter 9" requirement). * * This routine may be called in interrupt context. */ int usb_ep_dequeue(struct usb_ep *ep, struct usb_request *req) { int ret; ret = ep->ops->dequeue(ep, req); trace_usb_ep_dequeue(ep, req, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_dequeue); /** * usb_ep_set_halt - sets the endpoint halt feature. * @ep: the non-isochronous endpoint being stalled * * Use this to stall an endpoint, perhaps as an error report. * Except for control endpoints, * the endpoint stays halted (will not stream any data) until the host * clears this feature; drivers may need to empty the endpoint's request * queue first, to make sure no inappropriate transfers happen. * * Note that while an endpoint CLEAR_FEATURE will be invisible to the * gadget driver, a SET_INTERFACE will not be. To reset endpoints for the * current altsetting, see usb_ep_clear_halt(). When switching altsettings, * it's simplest to use usb_ep_enable() or usb_ep_disable() for the endpoints. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. On success, this call sets * underlying hardware state that blocks data transfers. * Attempts to halt IN endpoints will fail (returning -EAGAIN) if any * transfer requests are still queued, or if the controller hardware * (usually a FIFO) still holds bytes that the host hasn't collected. */ int usb_ep_set_halt(struct usb_ep *ep) { int ret; ret = ep->ops->set_halt(ep, 1); trace_usb_ep_set_halt(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_set_halt); /** * usb_ep_clear_halt - clears endpoint halt, and resets toggle * @ep:the bulk or interrupt endpoint being reset * * Use this when responding to the standard usb "set interface" request, * for endpoints that aren't reconfigured, after clearing any other state * in the endpoint's i/o queue. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. On success, this call clears * the underlying hardware state reflecting endpoint halt and data toggle. * Note that some hardware can't support this request (like pxa2xx_udc), * and accordingly can't correctly implement interface altsettings. */ int usb_ep_clear_halt(struct usb_ep *ep) { int ret; ret = ep->ops->set_halt(ep, 0); trace_usb_ep_clear_halt(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_clear_halt); /** * usb_ep_set_wedge - sets the halt feature and ignores clear requests * @ep: the endpoint being wedged * * Use this to stall an endpoint and ignore CLEAR_FEATURE(HALT_ENDPOINT) * requests. If the gadget driver clears the halt status, it will * automatically unwedge the endpoint. * * This routine may be called in interrupt context. * * Returns zero on success, else negative errno. */ int usb_ep_set_wedge(struct usb_ep *ep) { int ret; if (ep->ops->set_wedge) ret = ep->ops->set_wedge(ep); else ret = ep->ops->set_halt(ep, 1); trace_usb_ep_set_wedge(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_set_wedge); /** * usb_ep_fifo_status - returns number of bytes in fifo, or error * @ep: the endpoint whose fifo status is being checked. * * FIFO endpoints may have "unclaimed data" in them in certain cases, * such as after aborted transfers. Hosts may not have collected all * the IN data written by the gadget driver (and reported by a request * completion). The gadget driver may not have collected all the data * written OUT to it by the host. Drivers that need precise handling for * fault reporting or recovery may need to use this call. * * This routine may be called in interrupt context. * * This returns the number of such bytes in the fifo, or a negative * errno if the endpoint doesn't use a FIFO or doesn't support such * precise handling. */ int usb_ep_fifo_status(struct usb_ep *ep) { int ret; if (ep->ops->fifo_status) ret = ep->ops->fifo_status(ep); else ret = -EOPNOTSUPP; trace_usb_ep_fifo_status(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_fifo_status); /** * usb_ep_fifo_flush - flushes contents of a fifo * @ep: the endpoint whose fifo is being flushed. * * This call may be used to flush the "unclaimed data" that may exist in * an endpoint fifo after abnormal transaction terminations. The call * must never be used except when endpoint is not being used for any * protocol translation. * * This routine may be called in interrupt context. */ void usb_ep_fifo_flush(struct usb_ep *ep) { if (ep->ops->fifo_flush) ep->ops->fifo_flush(ep); trace_usb_ep_fifo_flush(ep, 0); } EXPORT_SYMBOL_GPL(usb_ep_fifo_flush); /* ------------------------------------------------------------------------- */ /** * usb_gadget_frame_number - returns the current frame number * @gadget: controller that reports the frame number * * Returns the usb frame number, normally eleven bits from a SOF packet, * or negative errno if this device doesn't support this capability. */ int usb_gadget_frame_number(struct usb_gadget *gadget) { int ret; ret = gadget->ops->get_frame(gadget); trace_usb_gadget_frame_number(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_frame_number); /** * usb_gadget_wakeup - tries to wake up the host connected to this gadget * @gadget: controller used to wake up the host * * Returns zero on success, else negative error code if the hardware * doesn't support such attempts, or its support has not been enabled * by the usb host. Drivers must return device descriptors that report * their ability to support this, or hosts won't enable it. * * This may also try to use SRP to wake the host and start enumeration, * even if OTG isn't otherwise in use. OTG devices may also start * remote wakeup even when hosts don't explicitly enable it. */ int usb_gadget_wakeup(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->wakeup) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->wakeup(gadget); out: trace_usb_gadget_wakeup(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_wakeup); /** * usb_gadget_set_remote_wakeup - configures the device remote wakeup feature. * @gadget:the device being configured for remote wakeup * @set:value to be configured. * * set to one to enable remote wakeup feature and zero to disable it. * * returns zero on success, else negative errno. */ int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set) { int ret = 0; if (!gadget->ops->set_remote_wakeup) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_remote_wakeup(gadget, set); out: trace_usb_gadget_set_remote_wakeup(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_set_remote_wakeup); /** * usb_gadget_set_selfpowered - sets the device selfpowered feature. * @gadget:the device being declared as self-powered * * this affects the device status reported by the hardware driver * to reflect that it now has a local power supply. * * returns zero on success, else negative errno. */ int usb_gadget_set_selfpowered(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->set_selfpowered) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_selfpowered(gadget, 1); out: trace_usb_gadget_set_selfpowered(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_set_selfpowered); /** * usb_gadget_clear_selfpowered - clear the device selfpowered feature. * @gadget:the device being declared as bus-powered * * this affects the device status reported by the hardware driver. * some hardware may not support bus-powered operation, in which * case this feature's value can never change. * * returns zero on success, else negative errno. */ int usb_gadget_clear_selfpowered(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->set_selfpowered) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_selfpowered(gadget, 0); out: trace_usb_gadget_clear_selfpowered(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_clear_selfpowered); /** * usb_gadget_vbus_connect - Notify controller that VBUS is powered * @gadget:The device which now has VBUS power. * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session starting. Common responses include * resuming the controller, activating the D+ (or D-) pullup to let the * host detect that a USB device is attached, and starting to draw power * (8mA or possibly more, especially after SET_CONFIGURATION). * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_connect(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->vbus_session) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_session(gadget, 1); out: trace_usb_gadget_vbus_connect(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_connect); /** * usb_gadget_vbus_draw - constrain controller's VBUS power usage * @gadget:The device whose VBUS usage is being described * @mA:How much current to draw, in milliAmperes. This should be twice * the value listed in the configuration descriptor bMaxPower field. * * This call is used by gadget drivers during SET_CONFIGURATION calls, * reporting how much power the device may consume. For example, this * could affect how quickly batteries are recharged. * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_draw(struct usb_gadget *gadget, unsigned mA) { int ret = 0; if (!gadget->ops->vbus_draw) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_draw(gadget, mA); if (!ret) gadget->mA = mA; out: trace_usb_gadget_vbus_draw(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_draw); /** * usb_gadget_vbus_disconnect - notify controller about VBUS session end * @gadget:the device whose VBUS supply is being described * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session ending. Common responses include * reversing everything done in usb_gadget_vbus_connect(). * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_disconnect(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->vbus_session) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_session(gadget, 0); out: trace_usb_gadget_vbus_disconnect(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_disconnect); static int usb_gadget_connect_locked(struct usb_gadget *gadget) __must_hold(&gadget->udc->connect_lock) { int ret = 0; if (!gadget->ops->pullup) { ret = -EOPNOTSUPP; goto out; } if (gadget->deactivated || !gadget->udc->allow_connect || !gadget->udc->started) { /* * If the gadget isn't usable (because it is deactivated, * unbound, or not yet started), we only save the new state. * The gadget will be connected automatically when it is * activated/bound/started. */ gadget->connected = true; goto out; } ret = gadget->ops->pullup(gadget, 1); if (!ret) gadget->connected = 1; out: trace_usb_gadget_connect(gadget, ret); return ret; } /** * usb_gadget_connect - software-controlled connect to USB host * @gadget:the peripheral being connected * * Enables the D+ (or potentially D-) pullup. The host will start * enumerating this gadget when the pullup is active and a VBUS session * is active (the link is powered). * * Returns zero on success, else negative errno. */ int usb_gadget_connect(struct usb_gadget *gadget) { int ret; mutex_lock(&gadget->udc->connect_lock); ret = usb_gadget_connect_locked(gadget); mutex_unlock(&gadget->udc->connect_lock); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_connect); static int usb_gadget_disconnect_locked(struct usb_gadget *gadget) __must_hold(&gadget->udc->connect_lock) { int ret = 0; if (!gadget->ops->pullup) { ret = -EOPNOTSUPP; goto out; } if (!gadget->connected) goto out; if (gadget->deactivated || !gadget->udc->started) { /* * If gadget is deactivated we only save new state. * Gadget will stay disconnected after activation. */ gadget->connected = false; goto out; } ret = gadget->ops->pullup(gadget, 0); if (!ret) gadget->connected = 0; mutex_lock(&udc_lock); if (gadget->udc->driver) gadget->udc->driver->disconnect(gadget); mutex_unlock(&udc_lock); out: trace_usb_gadget_disconnect(gadget, ret); return ret; } /** * usb_gadget_disconnect - software-controlled disconnect from USB host * @gadget:the peripheral being disconnected * * Disables the D+ (or potentially D-) pullup, which the host may see * as a disconnect (when a VBUS session is active). Not all systems * support software pullup controls. * * Following a successful disconnect, invoke the ->disconnect() callback * for the current gadget driver so that UDC drivers don't need to. * * Returns zero on success, else negative errno. */ int usb_gadget_disconnect(struct usb_gadget *gadget) { int ret; mutex_lock(&gadget->udc->connect_lock); ret = usb_gadget_disconnect_locked(gadget); mutex_unlock(&gadget->udc->connect_lock); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_disconnect); /** * usb_gadget_deactivate - deactivate function which is not ready to work * @gadget: the peripheral being deactivated * * This routine may be used during the gadget driver bind() call to prevent * the peripheral from ever being visible to the USB host, unless later * usb_gadget_activate() is called. For example, user mode components may * need to be activated before the system can talk to hosts. * * This routine may sleep; it must not be called in interrupt context * (such as from within a gadget driver's disconnect() callback). * * Returns zero on success, else negative errno. */ int usb_gadget_deactivate(struct usb_gadget *gadget) { int ret = 0; mutex_lock(&gadget->udc->connect_lock); if (gadget->deactivated) goto unlock; if (gadget->connected) { ret = usb_gadget_disconnect_locked(gadget); if (ret) goto unlock; /* * If gadget was being connected before deactivation, we want * to reconnect it in usb_gadget_activate(). */ gadget->connected = true; } gadget->deactivated = true; unlock: mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_deactivate(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_deactivate); /** * usb_gadget_activate - activate function which is not ready to work * @gadget: the peripheral being activated * * This routine activates gadget which was previously deactivated with * usb_gadget_deactivate() call. It calls usb_gadget_connect() if needed. * * This routine may sleep; it must not be called in interrupt context. * * Returns zero on success, else negative errno. */ int usb_gadget_activate(struct usb_gadget *gadget) { int ret = 0; mutex_lock(&gadget->udc->connect_lock); if (!gadget->deactivated) goto unlock; gadget->deactivated = false; /* * If gadget has been connected before deactivation, or became connected * while it was being deactivated, we call usb_gadget_connect(). */ if (gadget->connected) ret = usb_gadget_connect_locked(gadget); unlock: mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_activate(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_activate); /* ------------------------------------------------------------------------- */ #ifdef CONFIG_HAS_DMA int usb_gadget_map_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0) return 0; if (req->sg_was_mapped) { req->num_mapped_sgs = req->num_sgs; return 0; } if (req->num_sgs) { int mapped; mapped = dma_map_sg(dev, req->sg, req->num_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (mapped == 0) { dev_err(dev, "failed to map SGs\n"); return -EFAULT; } req->num_mapped_sgs = mapped; } else { if (is_vmalloc_addr(req->buf)) { dev_err(dev, "buffer is not dma capable\n"); return -EFAULT; } else if (object_is_on_stack(req->buf)) { dev_err(dev, "buffer is on stack\n"); return -EFAULT; } req->dma = dma_map_single(dev, req->buf, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (dma_mapping_error(dev, req->dma)) { dev_err(dev, "failed to map buffer\n"); return -EFAULT; } req->dma_mapped = 1; } return 0; } EXPORT_SYMBOL_GPL(usb_gadget_map_request_by_dev); int usb_gadget_map_request(struct usb_gadget *gadget, struct usb_request *req, int is_in) { return usb_gadget_map_request_by_dev(gadget->dev.parent, req, is_in); } EXPORT_SYMBOL_GPL(usb_gadget_map_request); void usb_gadget_unmap_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0 || req->sg_was_mapped) return; if (req->num_mapped_sgs) { dma_unmap_sg(dev, req->sg, req->num_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->num_mapped_sgs = 0; } else if (req->dma_mapped) { dma_unmap_single(dev, req->dma, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->dma_mapped = 0; } } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev); void usb_gadget_unmap_request(struct usb_gadget *gadget, struct usb_request *req, int is_in) { usb_gadget_unmap_request_by_dev(gadget->dev.parent, req, is_in); } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request); #endif /* CONFIG_HAS_DMA */ /* ------------------------------------------------------------------------- */ /** * usb_gadget_giveback_request - give the request back to the gadget layer * @ep: the endpoint to be used with with the request * @req: the request being given back * * This is called by device controller drivers in order to return the * completed request back to the gadget layer. */ void usb_gadget_giveback_request(struct usb_ep *ep, struct usb_request *req) { if (likely(req->status == 0)) usb_led_activity(USB_LED_EVENT_GADGET); trace_usb_gadget_giveback_request(ep, req, 0); req->complete(ep, req); } EXPORT_SYMBOL_GPL(usb_gadget_giveback_request); /* ------------------------------------------------------------------------- */ /** * gadget_find_ep_by_name - returns ep whose name is the same as sting passed * in second parameter or NULL if searched endpoint not found * @g: controller to check for quirk * @name: name of searched endpoint */ struct usb_ep *gadget_find_ep_by_name(struct usb_gadget *g, const char *name) { struct usb_ep *ep; gadget_for_each_ep(ep, g) { if (!strcmp(ep->name, name)) return ep; } return NULL; } EXPORT_SYMBOL_GPL(gadget_find_ep_by_name); /* ------------------------------------------------------------------------- */ int usb_gadget_ep_match_desc(struct usb_gadget *gadget, struct usb_ep *ep, struct usb_endpoint_descriptor *desc, struct usb_ss_ep_comp_descriptor *ep_comp) { u8 type; u16 max; int num_req_streams = 0; /* endpoint already claimed? */ if (ep->claimed) return 0; type = usb_endpoint_type(desc); max = usb_endpoint_maxp(desc); if (usb_endpoint_dir_in(desc) && !ep->caps.dir_in) return 0; if (usb_endpoint_dir_out(desc) && !ep->caps.dir_out) return 0; if (max > ep->maxpacket_limit) return 0; /* "high bandwidth" works only at high speed */ if (!gadget_is_dualspeed(gadget) && usb_endpoint_maxp_mult(desc) > 1) return 0; switch (type) { case USB_ENDPOINT_XFER_CONTROL: /* only support ep0 for portable CONTROL traffic */ return 0; case USB_ENDPOINT_XFER_ISOC: if (!ep->caps.type_iso) return 0; /* ISO: limit 1023 bytes full speed, 1024 high/super speed */ if (!gadget_is_dualspeed(gadget) && max > 1023) return 0; break; case USB_ENDPOINT_XFER_BULK: if (!ep->caps.type_bulk) return 0; if (ep_comp && gadget_is_superspeed(gadget)) { /* Get the number of required streams from the * EP companion descriptor and see if the EP * matches it */ num_req_streams = ep_comp->bmAttributes & 0x1f; if (num_req_streams > ep->max_streams) return 0; } break; case USB_ENDPOINT_XFER_INT: /* Bulk endpoints handle interrupt transfers, * except the toggle-quirky iso-synch kind */ if (!ep->caps.type_int && !ep->caps.type_bulk) return 0; /* INT: limit 64 bytes full speed, 1024 high/super speed */ if (!gadget_is_dualspeed(gadget) && max > 64) return 0; break; } return 1; } EXPORT_SYMBOL_GPL(usb_gadget_ep_match_desc); /** * usb_gadget_check_config - checks if the UDC can support the binded * configuration * @gadget: controller to check the USB configuration * * Ensure that a UDC is able to support the requested resources by a * configuration, and that there are no resource limitations, such as * internal memory allocated to all requested endpoints. * * Returns zero on success, else a negative errno. */ int usb_gadget_check_config(struct usb_gadget *gadget) { if (gadget->ops->check_config) return gadget->ops->check_config(gadget); return 0; } EXPORT_SYMBOL_GPL(usb_gadget_check_config); /* ------------------------------------------------------------------------- */ static void usb_gadget_state_work(struct work_struct *work) { struct usb_gadget *gadget = work_to_gadget(work); struct usb_udc *udc = gadget->udc; if (udc) sysfs_notify(&udc->dev.kobj, NULL, "state"); } void usb_gadget_set_state(struct usb_gadget *gadget, enum usb_device_state state) { unsigned long flags; spin_lock_irqsave(&gadget->state_lock, flags); gadget->state = state; if (!gadget->teardown) schedule_work(&gadget->work); spin_unlock_irqrestore(&gadget->state_lock, flags); trace_usb_gadget_set_state(gadget, 0); } EXPORT_SYMBOL_GPL(usb_gadget_set_state); /* ------------------------------------------------------------------------- */ /* Acquire connect_lock before calling this function. */ static int usb_udc_connect_control_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (udc->vbus) return usb_gadget_connect_locked(udc->gadget); else return usb_gadget_disconnect_locked(udc->gadget); } static void vbus_event_work(struct work_struct *work) { struct usb_udc *udc = container_of(work, struct usb_udc, vbus_work); mutex_lock(&udc->connect_lock); usb_udc_connect_control_locked(udc); mutex_unlock(&udc->connect_lock); } /** * usb_udc_vbus_handler - updates the udc core vbus status, and try to * connect or disconnect gadget * @gadget: The gadget which vbus change occurs * @status: The vbus status * * The udc driver calls it when it wants to connect or disconnect gadget * according to vbus status. * * This function can be invoked from interrupt context by irq handlers of * the gadget drivers, however, usb_udc_connect_control() has to run in * non-atomic context due to the following: * a. Some of the gadget driver implementations expect the ->pullup * callback to be invoked in non-atomic context. * b. usb_gadget_disconnect() acquires udc_lock which is a mutex. * Hence offload invocation of usb_udc_connect_control() to workqueue. */ void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status) { struct usb_udc *udc = gadget->udc; if (udc) { udc->vbus = status; schedule_work(&udc->vbus_work); } } EXPORT_SYMBOL_GPL(usb_udc_vbus_handler); /** * usb_gadget_udc_reset - notifies the udc core that bus reset occurs * @gadget: The gadget which bus reset occurs * @driver: The gadget driver we want to notify * * If the udc driver has bus reset handler, it needs to call this when the bus * reset occurs, it notifies the gadget driver that the bus reset occurs as * well as updates gadget state. */ void usb_gadget_udc_reset(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { driver->reset(gadget); usb_gadget_set_state(gadget, USB_STATE_DEFAULT); } EXPORT_SYMBOL_GPL(usb_gadget_udc_reset); /** * usb_gadget_udc_start_locked - tells usb device controller to start up * @udc: The UDC to be started * * This call is issued by the UDC Class driver when it's about * to register a gadget driver to the device controller, before * calling gadget driver's bind() method. * * It allows the controller to be powered off until strictly * necessary to have it powered on. * * Returns zero on success, else negative errno. * * Caller should acquire connect_lock before invoking this function. */ static inline int usb_gadget_udc_start_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { int ret; if (udc->started) { dev_err(&udc->dev, "UDC had already started\n"); return -EBUSY; } ret = udc->gadget->ops->udc_start(udc->gadget, udc->driver); if (!ret) udc->started = true; return ret; } /** * usb_gadget_udc_stop_locked - tells usb device controller we don't need it anymore * @udc: The UDC to be stopped * * This call is issued by the UDC Class driver after calling * gadget driver's unbind() method. * * The details are implementation specific, but it can go as * far as powering off UDC completely and disable its data * line pullups. * * Caller should acquire connect lock before invoking this function. */ static inline void usb_gadget_udc_stop_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (!udc->started) { dev_err(&udc->dev, "UDC had already stopped\n"); return; } udc->gadget->ops->udc_stop(udc->gadget); udc->started = false; } /** * usb_gadget_udc_set_speed - tells usb device controller speed supported by * current driver * @udc: The device we want to set maximum speed * @speed: The maximum speed to allowed to run * * This call is issued by the UDC Class driver before calling * usb_gadget_udc_start() in order to make sure that we don't try to * connect on speeds the gadget driver doesn't support. */ static inline void usb_gadget_udc_set_speed(struct usb_udc *udc, enum usb_device_speed speed) { struct usb_gadget *gadget = udc->gadget; enum usb_device_speed s; if (speed == USB_SPEED_UNKNOWN) s = gadget->max_speed; else s = min(speed, gadget->max_speed); if (s == USB_SPEED_SUPER_PLUS && gadget->ops->udc_set_ssp_rate) gadget->ops->udc_set_ssp_rate(gadget, gadget->max_ssp_rate); else if (gadget->ops->udc_set_speed) gadget->ops->udc_set_speed(gadget, s); } /** * usb_gadget_enable_async_callbacks - tell usb device controller to enable asynchronous callbacks * @udc: The UDC which should enable async callbacks * * This routine is used when binding gadget drivers. It undoes the effect * of usb_gadget_disable_async_callbacks(); the UDC driver should enable IRQs * (if necessary) and resume issuing callbacks. * * This routine will always be called in process context. */ static inline void usb_gadget_enable_async_callbacks(struct usb_udc *udc) { struct usb_gadget *gadget = udc->gadget; if (gadget->ops->udc_async_callbacks) gadget->ops->udc_async_callbacks(gadget, true); } /** * usb_gadget_disable_async_callbacks - tell usb device controller to disable asynchronous callbacks * @udc: The UDC which should disable async callbacks * * This routine is used when unbinding gadget drivers. It prevents a race: * The UDC driver doesn't know when the gadget driver's ->unbind callback * runs, so unless it is told to disable asynchronous callbacks, it might * issue a callback (such as ->disconnect) after the unbind has completed. * * After this function runs, the UDC driver must suppress all ->suspend, * ->resume, ->disconnect, ->reset, and ->setup callbacks to the gadget driver * until async callbacks are again enabled. A simple-minded but effective * way to accomplish this is to tell the UDC hardware not to generate any * more IRQs. * * Request completion callbacks must still be issued. However, it's okay * to defer them until the request is cancelled, since the pull-up will be * turned off during the time period when async callbacks are disabled. * * This routine will always be called in process context. */ static inline void usb_gadget_disable_async_callbacks(struct usb_udc *udc) { struct usb_gadget *gadget = udc->gadget; if (gadget->ops->udc_async_callbacks) gadget->ops->udc_async_callbacks(gadget, false); } /** * usb_udc_release - release the usb_udc struct * @dev: the dev member within usb_udc * * This is called by driver's core in order to free memory once the last * reference is released. */ static void usb_udc_release(struct device *dev) { struct usb_udc *udc; udc = container_of(dev, struct usb_udc, dev); dev_dbg(dev, "releasing '%s'\n", dev_name(dev)); kfree(udc); } static const struct attribute_group *usb_udc_attr_groups[]; static void usb_udc_nop_release(struct device *dev) { dev_vdbg(dev, "%s\n", __func__); } /** * usb_initialize_gadget - initialize a gadget and its embedded struct device * @parent: the parent device to this udc. Usually the controller driver's * device. * @gadget: the gadget to be initialized. * @release: a gadget release function. */ void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { spin_lock_init(&gadget->state_lock); gadget->teardown = false; INIT_WORK(&gadget->work, usb_gadget_state_work); gadget->dev.parent = parent; if (release) gadget->dev.release = release; else gadget->dev.release = usb_udc_nop_release; device_initialize(&gadget->dev); gadget->dev.bus = &gadget_bus_type; } EXPORT_SYMBOL_GPL(usb_initialize_gadget); /** * usb_add_gadget - adds a new gadget to the udc class driver list * @gadget: the gadget to be added to the list. * * Returns zero on success, negative errno otherwise. * Does not do a final usb_put_gadget() if an error occurs. */ int usb_add_gadget(struct usb_gadget *gadget) { struct usb_udc *udc; int ret = -ENOMEM; udc = kzalloc_obj(*udc); if (!udc) goto error; device_initialize(&udc->dev); udc->dev.release = usb_udc_release; udc->dev.class = &udc_class; udc->dev.groups = usb_udc_attr_groups; udc->dev.parent = gadget->dev.parent; ret = dev_set_name(&udc->dev, "%s", kobject_name(&gadget->dev.parent->kobj)); if (ret) goto err_put_udc; udc->gadget = gadget; gadget->udc = udc; mutex_init(&udc->connect_lock); udc->started = false; mutex_lock(&udc_lock); list_add_tail(&udc->list, &udc_list); mutex_unlock(&udc_lock); INIT_WORK(&udc->vbus_work, vbus_event_work); ret = device_add(&udc->dev); if (ret) goto err_unlist_udc; usb_gadget_set_state(gadget, USB_STATE_NOTATTACHED); udc->vbus = true; ret = ida_alloc(&gadget_id_numbers, GFP_KERNEL); if (ret < 0) goto err_del_udc; gadget->id_number = ret; dev_set_name(&gadget->dev, "gadget.%d", ret); ret = device_add(&gadget->dev); if (ret) goto err_free_id; ret = sysfs_create_link(&udc->dev.kobj, &gadget->dev.kobj, "gadget"); if (ret) goto err_del_gadget; return 0; err_del_gadget: device_del(&gadget->dev); err_free_id: ida_free(&gadget_id_numbers, gadget->id_number); err_del_udc: flush_work(&gadget->work); device_del(&udc->dev); err_unlist_udc: mutex_lock(&udc_lock); list_del(&udc->list); mutex_unlock(&udc_lock); err_put_udc: put_device(&udc->dev); error: return ret; } EXPORT_SYMBOL_GPL(usb_add_gadget); /** * usb_add_gadget_udc_release - adds a new gadget to the udc class driver list * @parent: the parent device to this udc. Usually the controller driver's * device. * @gadget: the gadget to be added to the list. * @release: a gadget release function. * * Returns zero on success, negative errno otherwise. * Calls the gadget release function in the latter case. */ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { int ret; usb_initialize_gadget(parent, gadget, release); ret = usb_add_gadget(gadget); if (ret) usb_put_gadget(gadget); return ret; } EXPORT_SYMBOL_GPL(usb_add_gadget_udc_release); /** * usb_get_gadget_udc_name - get the name of the first UDC controller * This functions returns the name of the first UDC controller in the system. * Please note that this interface is usefull only for legacy drivers which * assume that there is only one UDC controller in the system and they need to * get its name before initialization. There is no guarantee that the UDC * of the returned name will be still available, when gadget driver registers * itself. * * Returns pointer to string with UDC controller name on success, NULL * otherwise. Caller should kfree() returned string. */ char *usb_get_gadget_udc_name(void) { struct usb_udc *udc; char *name = NULL; /* For now we take the first available UDC */ mutex_lock(&udc_lock); list_for_each_entry(udc, &udc_list, list) { if (!udc->driver) { name = kstrdup(udc->gadget->name, GFP_KERNEL); break; } } mutex_unlock(&udc_lock); return name; } EXPORT_SYMBOL_GPL(usb_get_gadget_udc_name); /** * usb_add_gadget_udc - adds a new gadget to the udc class driver list * @parent: the parent device to this udc. Usually the controller * driver's device. * @gadget: the gadget to be added to the list * * Returns zero on success, negative errno otherwise. */ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) { return usb_add_gadget_udc_release(parent, gadget, NULL); } EXPORT_SYMBOL_GPL(usb_add_gadget_udc); /** * usb_del_gadget - deletes a gadget and unregisters its udc * @gadget: the gadget to be deleted. * * This will unbind @gadget, if it is bound. * It will not do a final usb_put_gadget(). */ void usb_del_gadget(struct usb_gadget *gadget) { struct usb_udc *udc = gadget->udc; unsigned long flags; if (!udc) return; dev_vdbg(gadget->dev.parent, "unregistering gadget\n"); mutex_lock(&udc_lock); list_del(&udc->list); mutex_unlock(&udc_lock); kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); sysfs_remove_link(&udc->dev.kobj, "gadget"); device_del(&gadget->dev); /* * Set the teardown flag before flushing the work to prevent new work * from being scheduled while we are cleaning up. */ spin_lock_irqsave(&gadget->state_lock, flags); gadget->teardown = true; spin_unlock_irqrestore(&gadget->state_lock, flags); flush_work(&gadget->work); ida_free(&gadget_id_numbers, gadget->id_number); cancel_work_sync(&udc->vbus_work); device_unregister(&udc->dev); } EXPORT_SYMBOL_GPL(usb_del_gadget); /** * usb_del_gadget_udc - unregisters a gadget * @gadget: the gadget to be unregistered. * * Calls usb_del_gadget() and does a final usb_put_gadget(). */ void usb_del_gadget_udc(struct usb_gadget *gadget) { usb_del_gadget(gadget); usb_put_gadget(gadget); } EXPORT_SYMBOL_GPL(usb_del_gadget_udc); /* ------------------------------------------------------------------------- */ static int gadget_match_driver(struct device *dev, const struct device_driver *drv) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; const struct usb_gadget_driver *driver = container_of(drv, struct usb_gadget_driver, driver); /* If the driver specifies a udc_name, it must match the UDC's name */ if (driver->udc_name && strcmp(driver->udc_name, dev_name(&udc->dev)) != 0) return 0; /* If the driver is already bound to a gadget, it doesn't match */ if (driver->is_bound) return 0; /* Otherwise any gadget driver matches any UDC */ return 1; } static int gadget_bind_driver(struct device *dev) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = container_of(dev->driver, struct usb_gadget_driver, driver); int ret = 0; mutex_lock(&udc_lock); if (driver->is_bound) { mutex_unlock(&udc_lock); return -ENXIO; /* Driver binds to only one gadget */ } driver->is_bound = true; udc->driver = driver; mutex_unlock(&udc_lock); dev_dbg(&udc->dev, "binding gadget driver [%s]\n", driver->function); usb_gadget_udc_set_speed(udc, driver->max_speed); ret = driver->bind(udc->gadget, driver); if (ret) goto err_bind; mutex_lock(&udc->connect_lock); ret = usb_gadget_udc_start_locked(udc); if (ret) { mutex_unlock(&udc->connect_lock); goto err_start; } usb_gadget_enable_async_callbacks(udc); udc->allow_connect = true; ret = usb_udc_connect_control_locked(udc); if (ret) goto err_connect_control; mutex_unlock(&udc->connect_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); return 0; err_connect_control: udc->allow_connect = false; usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); err_start: driver->unbind(udc->gadget); err_bind: if (ret != -EISNAM) dev_err(&udc->dev, "failed to start %s: %d\n", driver->function, ret); mutex_lock(&udc_lock); udc->driver = NULL; driver->is_bound = false; mutex_unlock(&udc_lock); return ret; } static void gadget_unbind_driver(struct device *dev) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = udc->driver; dev_dbg(&udc->dev, "unbinding gadget driver [%s]\n", driver->function); udc->allow_connect = false; cancel_work_sync(&udc->vbus_work); mutex_lock(&udc->connect_lock); usb_gadget_disconnect_locked(gadget); usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); mutex_unlock(&udc->connect_lock); udc->driver->unbind(gadget); mutex_lock(&udc->connect_lock); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); mutex_lock(&udc_lock); driver->is_bound = false; udc->driver = NULL; mutex_unlock(&udc_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); } /* ------------------------------------------------------------------------- */ int usb_gadget_register_driver_owner(struct usb_gadget_driver *driver, struct module *owner, const char *mod_name) { int ret; if (!driver || !driver->bind || !driver->setup) return -EINVAL; driver->driver.bus = &gadget_bus_type; driver->driver.owner = owner; driver->driver.mod_name = mod_name; driver->driver.probe_type = PROBE_FORCE_SYNCHRONOUS; ret = driver_register(&driver->driver); if (ret) { pr_warn("%s: driver registration failed: %d\n", driver->function, ret); return ret; } mutex_lock(&udc_lock); if (!driver->is_bound) { if (driver->match_existing_only) { pr_warn("%s: couldn't find an available UDC or it's busy\n", driver->function); ret = -EBUSY; } else { pr_info("%s: couldn't find an available UDC\n", driver->function); ret = 0; } } mutex_unlock(&udc_lock); if (ret) driver_unregister(&driver->driver); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_register_driver_owner); int usb_gadget_unregister_driver(struct usb_gadget_driver *driver) { if (!driver || !driver->unbind) return -EINVAL; driver_unregister(&driver->driver); return 0; } EXPORT_SYMBOL_GPL(usb_gadget_unregister_driver); /* ------------------------------------------------------------------------- */ static ssize_t srp_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); if (sysfs_streq(buf, "1")) usb_gadget_wakeup(udc->gadget); return n; } static DEVICE_ATTR_WO(srp); static ssize_t soft_connect_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); ssize_t ret; device_lock(&udc->gadget->dev); if (!udc->driver) { dev_err(dev, "soft-connect without a gadget driver\n"); ret = -EOPNOTSUPP; goto out; } if (sysfs_streq(buf, "connect")) { mutex_lock(&udc->connect_lock); usb_gadget_udc_start_locked(udc); usb_gadget_connect_locked(udc->gadget); mutex_unlock(&udc->connect_lock); } else if (sysfs_streq(buf, "disconnect")) { mutex_lock(&udc->connect_lock); usb_gadget_disconnect_locked(udc->gadget); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); } else { dev_err(dev, "unsupported command '%s'\n", buf); ret = -EINVAL; goto out; } ret = n; out: device_unlock(&udc->gadget->dev); return ret; } static DEVICE_ATTR_WO(soft_connect); static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); struct usb_gadget *gadget = udc->gadget; return sprintf(buf, "%s\n", usb_state_string(gadget->state)); } static DEVICE_ATTR_RO(state); static ssize_t function_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); struct usb_gadget_driver *drv; int rc = 0; mutex_lock(&udc_lock); drv = udc->driver; if (drv && drv->function) rc = scnprintf(buf, PAGE_SIZE, "%s\n", drv->function); mutex_unlock(&udc_lock); return rc; } static DEVICE_ATTR_RO(function); #define USB_UDC_SPEED_ATTR(name, param) \ ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_udc *udc = container_of(dev, struct usb_udc, dev); \ return scnprintf(buf, PAGE_SIZE, "%s\n", \ usb_speed_string(udc->gadget->param)); \ } \ static DEVICE_ATTR_RO(name) static USB_UDC_SPEED_ATTR(current_speed, speed); static USB_UDC_SPEED_ATTR(maximum_speed, max_speed); #define USB_UDC_ATTR(name) \ ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_udc *udc = container_of(dev, struct usb_udc, dev); \ struct usb_gadget *gadget = udc->gadget; \ \ return scnprintf(buf, PAGE_SIZE, "%d\n", gadget->name); \ } \ static DEVICE_ATTR_RO(name) static USB_UDC_ATTR(is_otg); static USB_UDC_ATTR(is_a_peripheral); static USB_UDC_ATTR(b_hnp_enable); static USB_UDC_ATTR(a_hnp_support); static USB_UDC_ATTR(a_alt_hnp_support); static USB_UDC_ATTR(is_selfpowered); static struct attribute *usb_udc_attrs[] = { &dev_attr_srp.attr, &dev_attr_soft_connect.attr, &dev_attr_state.attr, &dev_attr_function.attr, &dev_attr_current_speed.attr, &dev_attr_maximum_speed.attr, &dev_attr_is_otg.attr, &dev_attr_is_a_peripheral.attr, &dev_attr_b_hnp_enable.attr, &dev_attr_a_hnp_support.attr, &dev_attr_a_alt_hnp_support.attr, &dev_attr_is_selfpowered.attr, NULL, }; static const struct attribute_group usb_udc_attr_group = { .attrs = usb_udc_attrs, }; static const struct attribute_group *usb_udc_attr_groups[] = { &usb_udc_attr_group, NULL, }; static int usb_udc_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_udc *udc = container_of(dev, struct usb_udc, dev); int ret; ret = add_uevent_var(env, "USB_UDC_NAME=%s", udc->gadget->name); if (ret) { dev_err(dev, "failed to add uevent USB_UDC_NAME\n"); return ret; } mutex_lock(&udc_lock); if (udc->driver) ret = add_uevent_var(env, "USB_UDC_DRIVER=%s", udc->driver->function); mutex_unlock(&udc_lock); if (ret) { dev_err(dev, "failed to add uevent USB_UDC_DRIVER\n"); return ret; } return 0; } static const struct class udc_class = { .name = "udc", .dev_uevent = usb_udc_uevent, }; static const struct bus_type gadget_bus_type = { .name = "gadget", .probe = gadget_bind_driver, .remove = gadget_unbind_driver, .match = gadget_match_driver, }; static int __init usb_udc_init(void) { int rc; rc = class_register(&udc_class); if (rc) return rc; rc = bus_register(&gadget_bus_type); if (rc) class_unregister(&udc_class); return rc; } subsys_initcall(usb_udc_init); static void __exit usb_udc_exit(void) { bus_unregister(&gadget_bus_type); class_unregister(&udc_class); } module_exit(usb_udc_exit); MODULE_DESCRIPTION("UDC Framework"); MODULE_AUTHOR("Felipe Balbi <balbi@ti.com>"); MODULE_LICENSE("GPL v2");
16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_QUEUE_H #define _NF_QUEUE_H #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/netfilter.h> #include <linux/rhashtable-types.h> #include <linux/skbuff.h> /* Each queued (to userspace) skbuff has one of these. */ struct nf_queue_entry { struct list_head list; struct rhash_head hash_node; struct sk_buff *skb; unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) struct net_device *physin; struct net_device *physout; #endif struct nf_hook_state state; bool nf_ct_is_unconfirmed; u16 size; /* sizeof(entry) + saved route keys */ u16 queue_num; /* extra space to store route keys */ }; #define nf_queue_entry_reroute(x) ((void *)x + sizeof(struct nf_queue_entry)) /* Packet queuing */ struct nf_queue_handler { int (*outfn)(struct nf_queue_entry *entry, unsigned int queuenum); void (*nf_hook_drop)(struct net *net); }; void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) { while (*jhash_initval == 0) *jhash_initval = get_random_u32(); } static inline u32 hash_v4(const struct iphdr *iph, u32 initval) { /* packets in either direction go into same queue */ if ((__force u32)iph->saddr < (__force u32)iph->daddr) return jhash_3words((__force u32)iph->saddr, (__force u32)iph->daddr, iph->protocol, initval); return jhash_3words((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, initval); } static inline u32 hash_v6(const struct ipv6hdr *ip6h, u32 initval) { u32 a, b, c; if ((__force u32)ip6h->saddr.s6_addr32[3] < (__force u32)ip6h->daddr.s6_addr32[3]) { a = (__force u32) ip6h->saddr.s6_addr32[3]; b = (__force u32) ip6h->daddr.s6_addr32[3]; } else { b = (__force u32) ip6h->saddr.s6_addr32[3]; a = (__force u32) ip6h->daddr.s6_addr32[3]; } if ((__force u32)ip6h->saddr.s6_addr32[1] < (__force u32)ip6h->daddr.s6_addr32[1]) c = (__force u32) ip6h->saddr.s6_addr32[1]; else c = (__force u32) ip6h->daddr.s6_addr32[1]; return jhash_3words(a, b, c, initval); } static inline u32 hash_bridge(const struct sk_buff *skb, u32 initval) { struct ipv6hdr *ip6h, _ip6h; struct iphdr *iph, _iph; switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): iph = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*iph), &_iph); if (iph) return hash_v4(iph, initval); break; case htons(ETH_P_IPV6): ip6h = skb_header_pointer(skb, skb_network_offset(skb), sizeof(*ip6h), &_ip6h); if (ip6h) return hash_v6(ip6h, initval); break; } return 0; } static inline u32 nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, u32 initval) { switch (family) { case NFPROTO_IPV4: queue += reciprocal_scale(hash_v4(ip_hdr(skb), initval), queues_total); break; case NFPROTO_IPV6: queue += reciprocal_scale(hash_v6(ipv6_hdr(skb), initval), queues_total); break; case NFPROTO_BRIDGE: queue += reciprocal_scale(hash_bridge(skb, initval), queues_total); break; } return queue; } int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int index, unsigned int verdict); #endif /* _NF_QUEUE_H */
495 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 /* SPDX-License-Identifier: GPL-2.0 */ /* * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.rst for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H #include <linux/lockdep_types.h> #include <linux/smp.h> #include <asm/percpu.h> struct task_struct; #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> #include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { int i; *to = *from; /* * Since the class cache can be modified concurrently we could observe * half pointers (64bit arch using 32bit copy insns). Therefore clear * the caches and take the performance hit. * * XXX it doesn't work well with lockdep_set_class_and_subclass(), since * that relies on cache abuse. */ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) to->class_cache[i] = NULL; } /* * Every lock has a list of other locks that were taken after it. * We only grow the list, never remove from it: */ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; const struct lock_trace *trace; u16 distance; /* bitmap of different dependencies from head to this */ u8 dep; /* used by BFS to record whether "prev -> this" only has -(*R)-> */ u8 only_xr; /* * The parent field is used to implement breadth-first search, and the * bit 0 is reused to indicate if the lock has been accessed in BFS. */ struct lock_list *parent; }; /** * struct lock_chain - lock dependency chain record * * @irq_context: the same as irq_context in held_lock below * @depth: the number of held locks in this chain * @base: the index in chain_hlocks for this chain * @entry: the collided lock chains in lock_chain hash list * @chain_key: the hash key of this lock_chain */ struct lock_chain { /* see BUILD_BUG_ON()s in add_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; /* * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); /* * Split the recursion counter in two to readily detect 'off' vs recursion. */ #define LOCKDEP_RECURSION_BITS 16 #define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) #define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) /* * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due * to header dependencies. */ #define lockdep_off() \ do { \ current->lockdep_recursion += LOCKDEP_OFF; \ } while (0) #define lockdep_on() \ do { \ current->lockdep_recursion -= LOCKDEP_OFF; \ } while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); /* * These methods are used by specific locking variants (spinlocks, * rwlocks, mutexes and rwsems) to pass init/acquire/release events * to lockdep: */ extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type); static inline void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer) { lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL); } static inline void lockdep_init_map_wait(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner) { lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV); } static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV); } /* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) * or they are too narrow (they suffer from a false class-split): */ #define lockdep_set_class(lock, key) \ lockdep_init_map_type(&(lock)->dep_map, #key, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_class_and_name(lock, key, name) \ lockdep_init_map_type(&(lock)->dep_map, name, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_class_and_subclass(lock, key, sub) \ lockdep_init_map_type(&(lock)->dep_map, #key, key, sub, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) #define lockdep_set_subclass(lock, sub) \ lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer, \ (lock)->dep_map.lock_type) /** * lockdep_set_novalidate_class: disable checking of lock ordering on a given * lock * @lock: Lock to mark * * Lockdep will still record that this lock has been taken, and print held * instances when dumping locks */ #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) /** * lockdep_set_notrack_class: disable lockdep tracking of a given lock entirely * @lock: Lock to mark * * Bigger hammer than lockdep_set_novalidate_class: so far just for bcachefs, * which takes more locks than lockdep is able to track (48). */ #define lockdep_set_notrack_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock) /* * Compare locking classes */ #define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key) static inline int lockdep_match_key(struct lockdep_map *lock, struct lock_class_key *key) { return lock->key == key; } /* * Acquire a lock. * * Values for "read": * * 0: exclusive (write) acquire * 1: read-acquire (no recursion allowed) * 2: read-acquire with same-instance recursion allowed * * Values for check: * * 0: simple checks (freeing, held-at-exit-time, etc.) * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); extern void lock_release(struct lockdep_map *lock, unsigned long ip); extern void lock_sync(struct lockdep_map *lock, unsigned int subclass, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); /* lock_is_held_type() returns */ #define LOCK_STATE_UNKNOWN -1 #define LOCK_STATE_NOT_HELD 0 #define LOCK_STATE_HELD 1 /* * Same "read" as for lock_acquire(), except -1 means any. */ extern int lock_is_held_type(const struct lockdep_map *lock, int read); static inline int lock_is_held(const struct lockdep_map *lock) { return lock_is_held_type(lock, -1); } #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) #define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); #define lock_set_novalidate_class(l, n, i) \ lock_set_class(l, n, &__lockdep_no_validate__, 0, i) static inline void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { lock_set_class(lock, lock->name, lock->key, subclass, ip); } extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) #define lockdep_assert(cond) \ do { WARN_ON(debug_locks && !(cond)); } while (0) #define lockdep_assert_once(cond) \ do { WARN_ON_ONCE(debug_locks && !(cond)); } while (0) #define lockdep_assert_held(l) \ do { lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD); __assume_ctx_lock(l); } while (0) #define lockdep_assert_not_held(l) \ lockdep_assert(lockdep_is_held(l) != LOCK_STATE_HELD) #define lockdep_assert_held_write(l) \ do { lockdep_assert(lockdep_is_held_type(l, 0)); __assume_ctx_lock(l); } while (0) #define lockdep_assert_held_read(l) \ do { lockdep_assert(lockdep_is_held_type(l, 1)); __assume_shared_ctx_lock(l); } while (0) #define lockdep_assert_held_once(l) \ lockdep_assert_once(lockdep_is_held(l) != LOCK_STATE_NOT_HELD) #define lockdep_assert_none_held_once() \ lockdep_assert_once(!current->lockdep_depth) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) #define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) /* * Must use lock_map_aquire_try() with override maps to avoid * lockdep thinking they participate in the block chain. */ #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map _name = { \ .name = #_name "-wait-type-override", \ .wait_type_inner = _wait_type, \ .lock_type = LD_LOCK_WAIT_OVERRIDE, } #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) { } static inline void lockdep_off(void) { } static inline void lockdep_on(void) { } static inline void lockdep_set_selftest_task(struct task_struct *task) { } # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, key, s, i) do { (void)(key); } while (0) # define lock_set_novalidate_class(l, n, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_wait(lock, name, key, sub, inner) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define lockdep_set_class_and_name(lock, key, name) \ do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) #define lockdep_set_notrack_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather * #ifdef the call himself. */ # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) static inline void lockdep_register_key(struct lock_class_key *key) { } static inline void lockdep_unregister_key(struct lock_class_key *key) { } #define lockdep_depth(tsk) (0) /* * Dummy forward declarations, allow users to write less ifdef-y code * and depend on dead code elimination. */ extern int lock_is_held(const void *); extern int lockdep_is_held(const void *); #define lockdep_is_held_type(l, r) (1) #define lockdep_assert(c) do { } while (0) #define lockdep_assert_once(c) do { } while (0) #define lockdep_assert_held(l) __assume_ctx_lock(l) #define lockdep_assert_not_held(l) do { (void)(l); } while (0) #define lockdep_assert_held_write(l) __assume_ctx_lock(l) #define lockdep_assert_held_read(l) __assume_shared_ctx_lock(l) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_assert_none_held_once() do { } while (0) #define lockdep_recursing(tsk) (0) #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} #endif /* !LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING void lockdep_set_lock_cmp_fn(struct lockdep_map *, lock_cmp_fn, lock_print_fn); #define lock_set_cmp_fn(lock, ...) lockdep_set_lock_cmp_fn(&(lock)->dep_map, __VA_ARGS__) #else #define lock_set_cmp_fn(lock, ...) do { } while (0) #endif enum xhlock_context_t { XHLOCK_HARD, XHLOCK_SOFT, XHLOCK_CTX_NR, }; /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. */ #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ ({ \ int ____err = 0; \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ ____err = lock(_lock); \ } \ if (!____err) \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ ____err; \ }) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) #define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ lock(_lock) #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) { } #endif /* Variable used to make lockdep treat read_lock() as recursive in selftests */ #ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS extern unsigned int force_read_lock_recursive; #else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #define force_read_lock_recursive 0 #endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #ifdef CONFIG_LOCKDEP extern bool read_lock_is_recursive(void); #else /* CONFIG_LOCKDEP */ /* If !LOCKDEP, the value is meaningless */ #define read_lock_is_recursive() 0 #endif /* * For trivial one-depth nesting of a lock-class, the following * global define can be used. (Subsystems with multiple levels * of nesting should define their own lock-nesting subclasses.) */ #define SINGLE_DEPTH_NESTING 1 /* * Map the dependency ops to NOP or to real lockdep ops, depending * on the per lock-class debug mode: */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, i) lock_release(l, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwlock_acquire_read(l, s, t, i) \ do { \ if (read_lock_is_recursive()) \ lock_acquire_shared_recursive(l, s, t, NULL, i); \ else \ lock_acquire_shared(l, s, t, NULL, i); \ } while (0) #define rwlock_release(l, i) lock_release(l, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define seqcount_release(l, i) lock_release(l, i) #define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define mutex_release(l, i) lock_release(l, i) #define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) #define lock_map_sync(l) lock_sync(l, 0, 0, 1, NULL, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_nested(lock, subclass) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL, \ _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); DECLARE_PER_CPU(unsigned int, lockdep_recursion); #define __lockdep_enabled (debug_locks && !this_cpu_read(lockdep_recursion)) #define lockdep_assert_irqs_enabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_irqs_disabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_in_irq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \ } while (0) #define lockdep_assert_no_hardirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && (this_cpu_read(hardirq_context) || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_enabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() != 0 || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_disabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() == 0 && \ this_cpu_read(hardirqs_enabled))); \ } while (0) /* * Acceptable for protecting per-CPU resources accessed from BH. * Much like in_softirq() - semantics are ambiguous, use carefully. */ #define lockdep_assert_in_softirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && \ (!in_softirq() || in_hardirq() || in_nmi())); \ } while (0) extern void lockdep_assert_in_softirq_func(void); #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) # define lockdep_assert_no_hardirq() do { } while (0) # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) # define lockdep_assert_in_softirq() do { } while (0) # define lockdep_assert_in_softirq_func() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) #else # define lockdep_assert_RT_in_threaded_ctx() do { } while (0) #endif #ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); #else static inline void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { } #endif #endif /* __LINUX_LOCKDEP_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 /* * linux/fs/hfs/hfs_fs.h * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. */ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H #include <linux/slab.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/workqueue.h> #include <asm/byteorder.h> #include <linux/uaccess.h> #include "hfs.h" /* * struct hfs_inode_info * * The HFS-specific part of a Linux (struct inode) */ struct hfs_inode_info { atomic_t opencnt; unsigned int flags; /* to deal with localtime ugliness */ int tz_secondswest; struct hfs_cat_key cat_key; struct list_head open_dir_list; spinlock_t open_dir_lock; struct inode *rsrc_inode; struct mutex extents_lock; u16 alloc_blocks, clump_blocks; sector_t fs_blocks; /* Allocation extents from catlog record or volume header */ hfs_extent_rec first_extents; u16 first_blocks; hfs_extent_rec cached_extents; u16 cached_start, cached_blocks; loff_t phys_size; struct inode vfs_inode; }; #define HFS_FLG_RSRC 0x0001 #define HFS_FLG_EXT_DIRTY 0x0002 #define HFS_FLG_EXT_NEW 0x0004 #define HFS_IS_RSRC(inode) (HFS_I(inode)->flags & HFS_FLG_RSRC) /* * struct hfs_sb_info * * The HFS-specific part of a Linux (struct super_block) */ struct hfs_sb_info { struct buffer_head *mdb_bh; /* The hfs_buffer holding the real superblock (aka VIB or MDB) */ struct hfs_mdb *mdb; struct buffer_head *alt_mdb_bh; /* The hfs_buffer holding the alternate superblock */ struct hfs_mdb *alt_mdb; __be32 *bitmap; /* The page holding the allocation bitmap */ struct hfs_btree *ext_tree; /* Information about the extents b-tree */ struct hfs_btree *cat_tree; /* Information about the catalog b-tree */ atomic64_t file_count; /* The number of regular files in the filesystem */ atomic64_t folder_count; /* The number of directories in the filesystem */ atomic64_t next_id; /* The next available file id number */ u32 clumpablks; /* The number of allocation blocks to try to add when extending a file */ u32 fs_start; /* The first 512-byte block represented in the bitmap */ u32 part_start; u16 root_files; /* The number of regular (non-directory) files in the root directory */ u16 root_dirs; /* The number of directories in the root directory */ u16 fs_ablocks; /* The number of allocation blocks in the filesystem */ u16 free_ablocks; /* the number of unused allocation blocks in the filesystem */ u32 alloc_blksz; /* The size of an "allocation block" */ int s_quiet; /* Silent failure when changing owner or mode? */ __be32 s_type; /* Type for new files */ __be32 s_creator; /* Creator for new files */ umode_t s_file_umask; /* The umask applied to the permissions on all files */ umode_t s_dir_umask; /* The umask applied to the permissions on all dirs */ kuid_t s_uid; /* The uid of all files */ kgid_t s_gid; /* The gid of all files */ int session, part; struct nls_table *nls_io, *nls_disk; struct mutex bitmap_lock; unsigned long flags; u16 blockoffset; int fs_div; struct super_block *sb; int work_queued; /* non-zero delayed work is queued */ struct delayed_work mdb_work; /* MDB flush delayed work */ spinlock_t work_lock; /* protects mdb_work and work_queued */ }; #define HFS_FLG_BITMAP_DIRTY 0 #define HFS_FLG_MDB_DIRTY 1 #define HFS_FLG_ALT_MDB_DIRTY 2 /* bitmap.c */ extern u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits); extern int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count); /* catalog.c */ extern int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2); struct hfs_find_data; extern int hfs_cat_find_brec(struct super_block *sb, u32 cnid, struct hfs_find_data *fd); extern int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct inode *inode); extern int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str); extern int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, struct inode *dst_dir, const struct qstr *dst_name); extern void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, const struct qstr *name); /* dir.c */ extern const struct file_operations hfs_dir_operations; extern const struct inode_operations hfs_dir_inode_operations; /* extent.c */ extern int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2); extern u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off); extern int hfs_free_fork(struct super_block *sb, struct hfs_cat_file *file, int type); extern int hfs_ext_write_extent(struct inode *inode); extern int hfs_extend_file(struct inode *inode); extern void hfs_file_truncate(struct inode *inode); extern int hfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); /* inode.c */ extern const struct address_space_operations hfs_aops; extern const struct address_space_operations hfs_btree_aops; int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned int len, struct folio **foliop, void **fsdata); extern struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t mode); extern void hfs_inode_write_fork(struct inode *inode, struct hfs_extent *ext, __be32 *log_size, __be32 *phys_size); extern int hfs_write_inode(struct inode *inode, struct writeback_control *wbc); extern int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 __log_size, __be32 phys_size, u32 clump_size); extern struct inode *hfs_iget(struct super_block *sb, struct hfs_cat_key *key, hfs_cat_rec *rec); extern void hfs_evict_inode(struct inode *inode); extern void hfs_delete_inode(struct inode *inode); /* attr.c */ extern const struct xattr_handler * const hfs_xattr_handlers[]; /* mdb.c */ extern bool is_hfs_cnid_counts_valid(struct super_block *sb); extern int hfs_mdb_get(struct super_block *sb); extern void hfs_mdb_commit(struct super_block *sb); extern void hfs_mdb_close(struct super_block *sb); extern void hfs_mdb_put(struct super_block *sb); /* part_tbl.c */ extern int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size); /* string.c */ extern const struct dentry_operations hfs_dentry_operations; extern int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this); extern int hfs_strcmp(const unsigned char *s1, unsigned int len1, const unsigned char *s2, unsigned int len2); extern int hfs_compare_dentry(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /* trans.c */ extern void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr *in); extern int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in); /* super.c */ extern void hfs_mark_mdb_dirty(struct super_block *sb); /* * There are two time systems. Both are based on seconds since * a particular time/date. * Unix: signed little-endian since 00:00 GMT, Jan. 1, 1970 * mac: unsigned big-endian since 00:00 GMT, Jan. 1, 1904 * * HFS implementations are highly inconsistent, this one matches the * traditional behavior of 64-bit Linux, giving the most useful * time range between 1970 and 2106, by treating any on-disk timestamp * under HFS_UTC_OFFSET (Jan 1 1970) as a time between 2040 and 2106. */ #define HFS_UTC_OFFSET 2082844800U static inline time64_t __hfs_m_to_utime(__be32 mt) { time64_t ut = (u32)(be32_to_cpu(mt) - HFS_UTC_OFFSET); return ut + sys_tz.tz_minuteswest * 60; } static inline __be32 __hfs_u_to_mtime(time64_t ut) { ut -= sys_tz.tz_minuteswest * 60; return cpu_to_be32(lower_32_bits(ut) + HFS_UTC_OFFSET); } #define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec64){ .tv_sec = __hfs_m_to_utime(time) } #define hfs_u_to_mtime(time) __hfs_u_to_mtime((time).tv_sec) #define hfs_mtime() __hfs_u_to_mtime(ktime_get_real_seconds()) static inline const char *hfs_mdb_name(struct super_block *sb) { return sb->s_id; } static inline void hfs_bitmap_dirty(struct super_block *sb) { set_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags); hfs_mark_mdb_dirty(sb); } #define sb_bread512(sb, sec, data) ({ \ struct buffer_head *__bh; \ sector_t __block; \ loff_t __start; \ int __offset; \ \ __start = (loff_t)(sec) << HFS_SECTOR_SIZE_BITS;\ __block = __start >> (sb)->s_blocksize_bits; \ __offset = __start & ((sb)->s_blocksize - 1); \ __bh = sb_bread((sb), __block); \ if (likely(__bh != NULL)) \ data = (void *)(__bh->b_data + __offset);\ else \ data = NULL; \ __bh; \ }) #endif
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // SPDX-License-Identifier: GPL-2.0 #ifndef _MEDIA_FRAME_VECTOR_H #define _MEDIA_FRAME_VECTOR_H /* Container for pinned pfns / pages in frame_vector.c */ struct frame_vector { unsigned int nr_allocated; /* Number of frames we have space for */ unsigned int nr_frames; /* Number of frames stored in ptrs array */ bool got_ref; /* Did we pin pages by getting page ref? */ bool is_pfns; /* Does array contain pages or pfns? */ void *ptrs[]; /* Array of pinned pfns / pages. Use * pfns_vector_pages() or pfns_vector_pfns() * for access */ }; struct frame_vector *frame_vector_create(unsigned int nr_frames); void frame_vector_destroy(struct frame_vector *vec); int get_vaddr_frames(unsigned long start, unsigned int nr_pfns, bool write, struct frame_vector *vec); void put_vaddr_frames(struct frame_vector *vec); int frame_vector_to_pages(struct frame_vector *vec); void frame_vector_to_pfns(struct frame_vector *vec); static inline unsigned int frame_vector_count(struct frame_vector *vec) { return vec->nr_frames; } static inline struct page **frame_vector_pages(struct frame_vector *vec) { if (vec->is_pfns) { int err = frame_vector_to_pages(vec); if (err) return ERR_PTR(err); } return (struct page **)(vec->ptrs); } static inline unsigned long *frame_vector_pfns(struct frame_vector *vec) { if (!vec->is_pfns) frame_vector_to_pfns(vec); return (unsigned long *)(vec->ptrs); } #endif /* _MEDIA_FRAME_VECTOR_H */
2565 1802 1738 7 279 1 3 1670 38 1 1774 1675 1677 1957 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 // SPDX-License-Identifier: GPL-2.0 /* * device.h - generic, centralized driver model * * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2009 Novell Inc. * * See Documentation/driver-api/driver-model/ for more information. */ #ifndef _DEVICE_H_ #define _DEVICE_H_ #include <linux/dev_printk.h> #include <linux/energy_model.h> #include <linux/ioport.h> #include <linux/kobject.h> #include <linux/klist.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/compiler.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/gfp.h> #include <linux/device/bus.h> #include <linux/device/class.h> #include <linux/device/devres.h> #include <linux/device/driver.h> #include <linux/cleanup.h> #include <asm/device.h> struct device; struct device_private; struct device_driver; struct driver_private; struct module; struct class; struct subsys_private; struct device_node; struct fwnode_handle; struct iommu_group; struct dev_pin_info; struct dev_iommu; struct msi_device_data; /** * struct subsys_interface - interfaces to device functions * @name: name of the device function * @subsys: subsystem of the devices to attach to * @node: the list of functions registered at the subsystem * @add_dev: device hookup to device function handler * @remove_dev: device hookup to device function handler * * Simple interfaces attached to a subsystem. Multiple interfaces can * attach to a subsystem and its devices. Unlike drivers, they do not * exclusively claim or control devices. Interfaces usually represent * a specific functionality of a subsystem/class of devices. */ struct subsys_interface { const char *name; const struct bus_type *subsys; struct list_head node; int (*add_dev)(struct device *dev, struct subsys_interface *sif); void (*remove_dev)(struct device *dev, struct subsys_interface *sif); }; int subsys_interface_register(struct subsys_interface *sif); void subsys_interface_unregister(struct subsys_interface *sif); int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups); int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups); /* * The type of device, "struct device" is embedded in. A class * or bus can contain devices of different types * like "partitions" and "disks", "mouse" and "event". * This identifies the device type and carries type-specific * information, equivalent to the kobj_type of a kobject. * If "name" is specified, the uevent will contain it in * the DEVTYPE variable. */ struct device_type { const char *name; const struct attribute_group **groups; int (*uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid); void (*release)(struct device *dev); const struct dev_pm_ops *pm; }; /** * struct device_attribute - Interface for exporting device attributes. * @attr: sysfs attribute definition. * @show: Show handler. * @store: Store handler. */ struct device_attribute { struct attribute attr; ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf); ssize_t (*store)(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); }; /** * struct dev_ext_attribute - Exported device attribute with extra context. * @attr: Exported device attribute. * @var: Pointer to context. */ struct dev_ext_attribute { struct device_attribute attr; void *var; }; ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_int(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_int(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_bool(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_bool(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_string(struct device *dev, struct device_attribute *attr, char *buf); /** * DEVICE_ATTR - Define a device attribute. * @_name: Attribute name. * @_mode: File mode. * @_show: Show handler. Optional, but mandatory if attribute is readable. * @_store: Store handler. Optional, but mandatory if attribute is writable. * * Convenience macro for defining a struct device_attribute. * * For example, ``DEVICE_ATTR(foo, 0644, foo_show, foo_store);`` expands to: * * .. code-block:: c * * struct device_attribute dev_attr_foo = { * .attr = { .name = "foo", .mode = 0644 }, * .show = foo_show, * .store = foo_store, * }; */ #define DEVICE_ATTR(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) /** * DEVICE_ATTR_PREALLOC - Define a preallocated device attribute. * @_name: Attribute name. * @_mode: File mode. * @_show: Show handler. Optional, but mandatory if attribute is readable. * @_store: Store handler. Optional, but mandatory if attribute is writable. * * Like DEVICE_ATTR(), but ``SYSFS_PREALLOC`` is set on @_mode. */ #define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_PREALLOC(_name, _mode, _show, _store) /** * DEVICE_ATTR_RW - Define a read-write device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0644, @_show is <_name>_show, * and @_store is <_name>_store. */ #define DEVICE_ATTR_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW(_name) /** * DEVICE_ATTR_ADMIN_RW - Define an admin-only read-write device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR_RW(), but @_mode is 0600. */ #define DEVICE_ATTR_ADMIN_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) /** * DEVICE_ATTR_RW_NAMED - Define a read-write device attribute with a sysfs name * that differs from the function name. * @_name: Attribute function preface * @_attrname: Attribute name as it wil be exposed in the sysfs. * * Like DEVICE_ATTR_RW(), but allows for reusing names under separate paths in * the same driver. */ #define DEVICE_ATTR_RW_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0644 }, \ .show = _name##_show, \ .store = _name##_store, \ } /** * DEVICE_ATTR_RO - Define a readable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0444 and @_show is <_name>_show. */ #define DEVICE_ATTR_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO(_name) /** * DEVICE_ATTR_ADMIN_RO - Define an admin-only readable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR_RO(), but @_mode is 0400. */ #define DEVICE_ATTR_ADMIN_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) /** * DEVICE_ATTR_RO_NAMED - Define a read-only device attribute with a sysfs name * that differs from the function name. * @_name: Attribute function preface * @_attrname: Attribute name as it wil be exposed in the sysfs. * * Like DEVICE_ATTR_RO(), but allows for reusing names under separate paths in * the same driver. */ #define DEVICE_ATTR_RO_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0444 }, \ .show = _name##_show, \ } /** * DEVICE_ATTR_WO - Define an admin-only writable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0200 and @_store is <_name>_store. */ #define DEVICE_ATTR_WO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_WO(_name) /** * DEVICE_ATTR_WO_NAMED - Define a read-only device attribute with a sysfs name * that differs from the function name. * @_name: Attribute function preface * @_attrname: Attribute name as it wil be exposed in the sysfs. * * Like DEVICE_ATTR_WO(), but allows for reusing names under separate paths in * the same driver. */ #define DEVICE_ATTR_WO_NAMED(_name, _attrname) \ struct device_attribute dev_attr_##_name = { \ .attr = { .name = _attrname, .mode = 0200 }, \ .store = _name##_store, \ } /** * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of unsigned long. * * Like DEVICE_ATTR(), but @_show and @_store are automatically provided * such that reads and writes to the attribute from userspace affect @_var. */ #define DEVICE_ULONG_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) } /** * DEVICE_INT_ATTR - Define a device attribute backed by an int. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of int. * * Like DEVICE_ULONG_ATTR(), but @_var is an int. */ #define DEVICE_INT_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) } /** * DEVICE_BOOL_ATTR - Define a device attribute backed by a bool. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of bool. * * Like DEVICE_ULONG_ATTR(), but @_var is a bool. */ #define DEVICE_BOOL_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) } /** * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of string. * * Like DEVICE_ULONG_ATTR(), but @_var is a string. Because the length of the * string allocation is unknown, the attribute must be read-only. */ #define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) } #define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) int device_create_file(struct device *device, const struct device_attribute *entry); void device_remove_file(struct device *dev, const struct device_attribute *attr); bool device_remove_file_self(struct device *dev, const struct device_attribute *attr); int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); struct device_dma_parameters { /* * a low level driver may set these to teach IOMMU code about * sg limitations. */ unsigned int max_segment_size; unsigned int min_align_mask; unsigned long segment_boundary_mask; }; /** * enum device_link_state - Device link states. * @DL_STATE_NONE: The presence of the drivers is not being tracked. * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present. * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not. * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present). * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present. * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding. */ enum device_link_state { DL_STATE_NONE = -1, DL_STATE_DORMANT = 0, DL_STATE_AVAILABLE, DL_STATE_CONSUMER_PROBE, DL_STATE_ACTIVE, DL_STATE_SUPPLIER_UNBIND, }; /* * Device link flags. * * STATELESS: The core will not remove this link automatically. * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind. * PM_RUNTIME: If set, the runtime PM framework will use this link. * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation. * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind. * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds. * MANAGED: The core tracks presence of supplier/consumer drivers (internal). * SYNC_STATE_ONLY: Link only affects sync_state() behavior. * INFERRED: Inferred from data (eg: firmware) and not from driver actions. */ #define DL_FLAG_STATELESS BIT(0) #define DL_FLAG_AUTOREMOVE_CONSUMER BIT(1) #define DL_FLAG_PM_RUNTIME BIT(2) #define DL_FLAG_RPM_ACTIVE BIT(3) #define DL_FLAG_AUTOREMOVE_SUPPLIER BIT(4) #define DL_FLAG_AUTOPROBE_CONSUMER BIT(5) #define DL_FLAG_MANAGED BIT(6) #define DL_FLAG_SYNC_STATE_ONLY BIT(7) #define DL_FLAG_INFERRED BIT(8) #define DL_FLAG_CYCLE BIT(9) /** * enum dl_dev_state - Device driver presence tracking information. * @DL_DEV_NO_DRIVER: There is no driver attached to the device. * @DL_DEV_PROBING: A driver is probing. * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device. * @DL_DEV_UNBINDING: The driver is unbinding from the device. */ enum dl_dev_state { DL_DEV_NO_DRIVER = 0, DL_DEV_PROBING, DL_DEV_DRIVER_BOUND, DL_DEV_UNBINDING, }; /** * enum device_removable - Whether the device is removable. The criteria for a * device to be classified as removable is determined by its subsystem or bus. * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this * device (default). * @DEVICE_REMOVABLE_UNKNOWN: Device location is Unknown. * @DEVICE_FIXED: Device is not removable by the user. * @DEVICE_REMOVABLE: Device is removable by the user. */ enum device_removable { DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0 */ DEVICE_REMOVABLE_UNKNOWN, DEVICE_FIXED, DEVICE_REMOVABLE, }; /** * struct dev_links_info - Device data related to device links. * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @defer_sync: Hook to global list of devices that have deferred sync_state. * @status: Driver status information. */ struct dev_links_info { struct list_head suppliers; struct list_head consumers; struct list_head defer_sync; enum dl_dev_state status; }; /** * struct dev_msi_info - Device data related to MSI * @domain: The MSI interrupt domain associated to the device * @data: Pointer to MSI device data */ struct dev_msi_info { #ifdef CONFIG_GENERIC_MSI_IRQ struct irq_domain *domain; struct msi_device_data *data; #endif }; /** * enum device_physical_location_panel - Describes which panel surface of the * system's housing the device connection point resides on. * @DEVICE_PANEL_TOP: Device connection point is on the top panel. * @DEVICE_PANEL_BOTTOM: Device connection point is on the bottom panel. * @DEVICE_PANEL_LEFT: Device connection point is on the left panel. * @DEVICE_PANEL_RIGHT: Device connection point is on the right panel. * @DEVICE_PANEL_FRONT: Device connection point is on the front panel. * @DEVICE_PANEL_BACK: Device connection point is on the back panel. * @DEVICE_PANEL_UNKNOWN: The panel with device connection point is unknown. */ enum device_physical_location_panel { DEVICE_PANEL_TOP, DEVICE_PANEL_BOTTOM, DEVICE_PANEL_LEFT, DEVICE_PANEL_RIGHT, DEVICE_PANEL_FRONT, DEVICE_PANEL_BACK, DEVICE_PANEL_UNKNOWN, }; /** * enum device_physical_location_vertical_position - Describes vertical * position of the device connection point on the panel surface. * @DEVICE_VERT_POS_UPPER: Device connection point is at upper part of panel. * @DEVICE_VERT_POS_CENTER: Device connection point is at center part of panel. * @DEVICE_VERT_POS_LOWER: Device connection point is at lower part of panel. */ enum device_physical_location_vertical_position { DEVICE_VERT_POS_UPPER, DEVICE_VERT_POS_CENTER, DEVICE_VERT_POS_LOWER, }; /** * enum device_physical_location_horizontal_position - Describes horizontal * position of the device connection point on the panel surface. * @DEVICE_HORI_POS_LEFT: Device connection point is at left part of panel. * @DEVICE_HORI_POS_CENTER: Device connection point is at center part of panel. * @DEVICE_HORI_POS_RIGHT: Device connection point is at right part of panel. */ enum device_physical_location_horizontal_position { DEVICE_HORI_POS_LEFT, DEVICE_HORI_POS_CENTER, DEVICE_HORI_POS_RIGHT, }; /** * struct device_physical_location - Device data related to physical location * of the device connection point. * @panel: Panel surface of the system's housing that the device connection * point resides on. * @vertical_position: Vertical position of the device connection point within * the panel. * @horizontal_position: Horizontal position of the device connection point * within the panel. * @dock: Set if the device connection point resides in a docking station or * port replicator. * @lid: Set if this device connection point resides on the lid of laptop * system. */ struct device_physical_location { enum device_physical_location_panel panel; enum device_physical_location_vertical_position vertical_position; enum device_physical_location_horizontal_position horizontal_position; bool dock; bool lid; }; /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. * In most cases, a parent device is some sort of bus or host * controller. If parent is NULL, the device, is a top-level device, * which is not usually what you want. * @p: Holds the private data of the driver core portions of the device. * See the comment of the struct device_private for detail. * @kobj: A top-level, abstract class from which other classes are derived. * @init_name: Initial name of the device. * @type: The type of device. * This identifies the device type and carries type-specific * information. * @mutex: Mutex to synchronize calls to its driver. * @bus: Type of bus device is on. * @driver: Which driver has allocated this * @platform_data: Platform data specific to the device. * Example: For devices on custom boards, as typical of embedded * and SOC based hardware, Linux often uses platform_data to point * to board-specific structures describing devices and how they * are wired. That can include what ports are available, chip * variants, which GPIO pins act in what additional roles, and so * on. This shrinks the "Board Support Packages" (BSPs) and * minimizes board-specific #ifdefs in drivers. * @driver_data: Private pointer for driver specific info. * @links: Links to suppliers and consumers of this device. * @power: For device power management. * See Documentation/driver-api/pm/devices.rst for details. * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. * @msi: MSI related data * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. * @dma_mask: Dma mask (if dma'ble device). * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller * DMA limit than the device itself supports. * @dma_range_map: map for DMA memory ranges relative to that of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations * @dma_io_tlb_mem: Software IO TLB allocator. Not for driver use. * @dma_io_tlb_pools: List of transient swiotlb memory pools. * @dma_io_tlb_lock: Protects changes to the list of active pools. * @dma_uses_io_tlb: %true if device has used the software IO TLB. * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. * @devt: For creating the sysfs "dev". * @id: device instance * @devres_lock: Spinlock to protect the resource of the device. * @devres_head: The resources list of the device. * @class: The class of the device. * @groups: Optional attribute groups. * @release: Callback to free the device after all references have * gone away. This should be set by the allocator of the * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. * @iommu: Per device generic IOMMU runtime data * @physical_location: Describes physical location of the device connection * point in the system housing. * @removable: Whether the device can be removed from the system. This * should be set by the subsystem / bus driver that discovered * the device. * * @offline_disabled: If set, the device is permanently online. * @offline: Set after successful invocation of bus type's .offline(). * @of_node_reused: Set if the device-tree node is shared with an ancestor * device. * @state_synced: The hardware state of this device has been synced to match * the software state of this device by calling the driver/bus * sync_state() callback. * @can_match: The device has matched with a driver at least once or it is in * a bus (like AMBA) which can't check for matching drivers until * other devices probe successfully. * @dma_coherent: this particular device is dma coherent, even if the * architecture supports non-coherent devices. * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), * and optionall (if the coherent mask is large enough) also * for dma allocations. This flag is managed by the dma ops * instance from ->dma_supported. * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers. * @dma_iommu: Device is using default IOMMU implementation for DMA and * doesn't rely on dma_ops structure. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information * that the device model core needs to model the system. Most subsystems, * however, track additional information about the devices they host. As a * result, it is rare for devices to be represented by bare device structures; * instead, that structure, like kobject structures, is usually embedded within * a higher-level representation of the device. */ struct device { struct kobject kobj; struct device *parent; struct device_private *p; const char *init_name; /* initial name of the device */ const struct device_type *type; const struct bus_type *bus; /* type of bus device is on */ struct device_driver *driver; /* which driver has allocated this device */ void *platform_data; /* Platform specific data, device core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ struct mutex mutex; /* mutex to synchronize calls to * its driver. */ struct dev_links_info links; struct dev_pm_info power; struct dev_pm_domain *pm_domain; #ifdef CONFIG_ENERGY_MODEL struct em_perf_domain *em_pd; #endif #ifdef CONFIG_PINCTRL struct dev_pin_info *pins; #endif struct dev_msi_info msi; #ifdef CONFIG_ARCH_HAS_DMA_OPS const struct dma_map_ops *dma_ops; #endif u64 *dma_mask; /* dma mask (if dma'able device) */ u64 coherent_dma_mask;/* Like dma_mask, but for alloc_coherent mappings as not all hardware supports 64 bit addresses for consistent allocations such descriptors. */ u64 bus_dma_limit; /* upstream dma constraint */ const struct bus_dma_region *dma_range_map; struct device_dma_parameters *dma_parms; struct list_head dma_pools; /* dma pools (if dma'ble) */ #ifdef CONFIG_DMA_DECLARE_COHERENT struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */ #endif #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ #endif #ifdef CONFIG_SWIOTLB struct io_tlb_mem *dma_io_tlb_mem; #endif #ifdef CONFIG_SWIOTLB_DYNAMIC struct list_head dma_io_tlb_pools; spinlock_t dma_io_tlb_lock; bool dma_uses_io_tlb; #endif /* arch specific additions */ struct dev_archdata archdata; struct device_node *of_node; /* associated device tree node */ struct fwnode_handle *fwnode; /* firmware device node */ #ifdef CONFIG_NUMA int numa_node; /* NUMA node this device is close to */ #endif dev_t devt; /* dev_t, creates the sysfs "dev" */ u32 id; /* device instance */ spinlock_t devres_lock; struct list_head devres_head; const struct class *class; const struct attribute_group **groups; /* optional groups */ void (*release)(struct device *dev); struct iommu_group *iommu_group; struct dev_iommu *iommu; struct device_physical_location *physical_location; enum device_removable removable; bool offline_disabled:1; bool offline:1; bool of_node_reused:1; bool state_synced:1; bool can_match:1; #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_coherent:1; #endif #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif #ifdef CONFIG_DMA_NEED_SYNC bool dma_skip_sync:1; #endif #ifdef CONFIG_IOMMU_DMA bool dma_iommu:1; #endif }; /** * struct device_link - Device link representation. * @supplier: The device on the supplier end of the link. * @s_node: Hook to the supplier device's list of links to consumers. * @consumer: The device on the consumer end of the link. * @c_node: Hook to the consumer device's list of links to suppliers. * @link_dev: device used to expose link details in sysfs * @status: The state of the link (with respect to the presence of drivers). * @flags: Link flags. * @rpm_active: Whether or not the consumer device is runtime-PM-active. * @kref: Count repeated addition of the same link. * @rm_work: Work structure used for removing the link. * @supplier_preactivated: Supplier has been made active before consumer probe. */ struct device_link { struct device *supplier; struct list_head s_node; struct device *consumer; struct list_head c_node; struct device link_dev; enum device_link_state status; u32 flags; refcount_t rpm_active; struct kref kref; struct work_struct rm_work; bool supplier_preactivated; /* Owned by consumer probe. */ }; #define kobj_to_dev(__kobj) container_of_const(__kobj, struct device, kobj) /** * device_iommu_mapped - Returns true when the device DMA is translated * by an IOMMU * @dev: Device to perform the check on */ static inline bool device_iommu_mapped(struct device *dev) { return (dev->iommu_group != NULL); } /* Get the wakeup routines, which depend on struct device */ #include <linux/pm_wakeup.h> /** * dev_name - Return a device's name. * @dev: Device with name to get. * Return: The kobject name of the device, or its initial name if unavailable. */ static inline const char *dev_name(const struct device *dev) { /* Use the init name until the kobject becomes available */ if (dev->init_name) return dev->init_name; return kobject_name(&dev->kobj); } /** * dev_bus_name - Return a device's bus/class name, if at all possible * @dev: struct device to get the bus/class name of * * Will return the name of the bus/class the device is attached to. If it is * not attached to a bus/class, an empty string will be returned. */ static inline const char *dev_bus_name(const struct device *dev) { return dev->bus ? dev->bus->name : (dev->class ? dev->class->name : ""); } __printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) { return dev->numa_node; } static inline void set_dev_node(struct device *dev, int node) { dev->numa_node = node; } #else static inline int dev_to_node(struct device *dev) { return NUMA_NO_NODE; } static inline void set_dev_node(struct device *dev, int node) { } #endif static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) { #ifdef CONFIG_GENERIC_MSI_IRQ return dev->msi.domain; #else return NULL; #endif } static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d) { #ifdef CONFIG_GENERIC_MSI_IRQ dev->msi.domain = d; #endif } static inline void *dev_get_drvdata(const struct device *dev) { return dev->driver_data; } static inline void dev_set_drvdata(struct device *dev, void *data) { dev->driver_data = data; } static inline struct pm_subsys_data *dev_to_psd(struct device *dev) { return dev ? dev->power.subsys_data : NULL; } static inline unsigned int dev_get_uevent_suppress(const struct device *dev) { return dev->kobj.uevent_suppress; } static inline void dev_set_uevent_suppress(struct device *dev, int val) { dev->kobj.uevent_suppress = val; } static inline int device_is_registered(struct device *dev) { return dev->kobj.state_in_sysfs; } static inline void device_enable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = true; } static inline void device_disable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = false; } static inline bool device_async_suspend_enabled(struct device *dev) { return !!dev->power.async_suspend; } static inline bool device_pm_not_required(struct device *dev) { return dev->power.no_pm; } static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; #ifdef CONFIG_PM dev->power.no_callbacks = true; #endif } static inline void dev_pm_syscore_device(struct device *dev, bool val) { #ifdef CONFIG_PM_SLEEP dev->power.syscore = val; #endif } static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags) { dev->power.driver_flags = flags; } static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) { return !!(dev->power.driver_flags & flags); } static inline bool dev_pm_smart_suspend(struct device *dev) { #ifdef CONFIG_PM_SLEEP return dev->power.smart_suspend; #else return false; #endif } /* * dev_pm_set_strict_midlayer - Update the device's power.strict_midlayer flag * @dev: Target device. * @val: New flag value. * * When set, power.strict_midlayer means that the middle layer power management * code (typically, a bus type or a PM domain) does not expect its runtime PM * suspend callback to be invoked at all during system-wide PM transitions and * it does not expect its runtime PM resume callback to be invoked at any point * when runtime PM is disabled for the device during system-wide PM transitions. */ static inline void dev_pm_set_strict_midlayer(struct device *dev, bool val) { #ifdef CONFIG_PM_SLEEP dev->power.strict_midlayer = val; #endif } static inline bool dev_pm_strict_midlayer_is_set(struct device *dev) { #ifdef CONFIG_PM_SLEEP return dev->power.strict_midlayer; #else return false; #endif } static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); } static inline int device_lock_interruptible(struct device *dev) { return mutex_lock_interruptible(&dev->mutex); } static inline int device_trylock(struct device *dev) { return mutex_trylock(&dev->mutex); } static inline void device_unlock(struct device *dev) { mutex_unlock(&dev->mutex); } DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T)) static inline void device_lock_assert(struct device *dev) { lockdep_assert_held(&dev->mutex); } static inline bool dev_has_sync_state(struct device *dev) { if (!dev) return false; if (dev->driver && dev->driver->sync_state) return true; if (dev->bus && dev->bus->sync_state) return true; return false; } static inline int dev_set_drv_sync_state(struct device *dev, void (*fn)(struct device *dev)) { if (!dev || !dev->driver) return 0; if (dev->driver->sync_state && dev->driver->sync_state != fn) return -EBUSY; if (!dev->driver->sync_state) dev->driver->sync_state = fn; return 0; } static inline void dev_set_removable(struct device *dev, enum device_removable removable) { dev->removable = removable; } static inline bool dev_is_removable(struct device *dev) { return dev->removable == DEVICE_REMOVABLE; } static inline bool dev_removable_is_valid(struct device *dev) { return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED; } /* * High level routines for use by the bus drivers */ int __must_check device_register(struct device *dev); void device_unregister(struct device *dev); void device_initialize(struct device *dev); int __must_check device_add(struct device *dev); void device_del(struct device *dev); DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T)) int device_for_each_child(struct device *parent, void *data, device_iter_t fn); int device_for_each_child_reverse(struct device *parent, void *data, device_iter_t fn); int device_for_each_child_reverse_from(struct device *parent, struct device *from, void *data, device_iter_t fn); struct device *device_find_child(struct device *parent, const void *data, device_match_t match); /** * device_find_child_by_name - device iterator for locating a child device. * @parent: parent struct device * @name: name of the child device * * This is similar to the device_find_child() function above, but it * returns a reference to a device that has the name @name. * * NOTE: you will need to drop the reference with put_device() after use. */ static inline struct device *device_find_child_by_name(struct device *parent, const char *name) { return device_find_child(parent, name, device_match_name); } /** * device_find_any_child - device iterator for locating a child device, if any. * @parent: parent struct device * * This is similar to the device_find_child() function above, but it * returns a reference to a child device, if any. * * NOTE: you will need to drop the reference with put_device() after use. */ static inline struct device *device_find_any_child(struct device *parent) { return device_find_child(parent, NULL, device_match_any); } int device_rename(struct device *dev, const char *new_name); int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); static inline bool device_supports_offline(struct device *dev) { return dev->bus && dev->bus->offline && dev->bus->online; } #define __device_lock_set_class(dev, name, key) \ do { \ struct device *__d2 __maybe_unused = dev; \ lock_set_class(&__d2->mutex.dep_map, name, key, 0, _THIS_IP_); \ } while (0) /** * device_lock_set_class - Specify a temporary lock class while a device * is attached to a driver * @dev: device to modify * @key: lock class key data * * This must be called with the device_lock() already held, for example * from driver ->probe(). Take care to only override the default * lockdep_no_validate class. */ #ifdef CONFIG_LOCKDEP #define device_lock_set_class(dev, key) \ do { \ struct device *__d = dev; \ dev_WARN_ONCE(__d, !lockdep_match_class(&__d->mutex, \ &__lockdep_no_validate__), \ "overriding existing custom lock class\n"); \ __device_lock_set_class(__d, #key, key); \ } while (0) #else #define device_lock_set_class(dev, key) __device_lock_set_class(dev, #key, key) #endif /** * device_lock_reset_class - Return a device to the default lockdep novalidate state * @dev: device to modify * * This must be called with the device_lock() already held, for example * from driver ->remove(). */ #define device_lock_reset_class(dev) \ do { \ struct device *__d __maybe_unused = dev; \ lock_set_novalidate_class(&__d->mutex.dep_map, "&dev->mutex", \ _THIS_IP_); \ } while (0) void lock_device_hotplug(void); void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); int device_offline(struct device *dev); int device_online(struct device *dev); void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void device_set_node(struct device *dev, struct fwnode_handle *fwnode); int device_add_of_node(struct device *dev, struct device_node *of_node); void device_remove_of_node(struct device *dev); void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); struct device *get_dev_from_fwnode(struct fwnode_handle *fwnode); static inline struct device_node *dev_of_node(struct device *dev) { if (!IS_ENABLED(CONFIG_OF) || !dev) return NULL; return dev->of_node; } static inline int dev_num_vf(struct device *dev) { if (dev->bus && dev->bus->num_vf) return dev->bus->num_vf(dev); return 0; } /* * Root device objects for grouping under /sys/devices */ struct device *__root_device_register(const char *name, struct module *owner); /* This is a macro to avoid include problems with THIS_MODULE */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) void root_device_unregister(struct device *root); static inline void *dev_get_platdata(const struct device *dev) { return dev->platform_data; } /* * Manual binding of a device to driver. See drivers/base/bus.c * for information on use. */ int __must_check device_driver_attach(const struct device_driver *drv, struct device *dev); int __must_check device_bind_driver(struct device *dev); void device_release_driver(struct device *dev); int __must_check device_attach(struct device *dev); int __must_check driver_attach(const struct device_driver *drv); void device_initial_probe(struct device *dev); int __must_check device_reprobe(struct device *dev); bool device_is_bound(struct device *dev); /* * Easy functions for dynamically creating devices on the fly */ __printf(5, 6) struct device * device_create(const struct class *cls, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...); __printf(6, 7) struct device * device_create_with_groups(const struct class *cls, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, ...); void device_destroy(const struct class *cls, dev_t devt); int __must_check device_add_groups(struct device *dev, const struct attribute_group **groups); void device_remove_groups(struct device *dev, const struct attribute_group **groups); static inline int __must_check device_add_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; return device_add_groups(dev, groups); } static inline void device_remove_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; device_remove_groups(dev, groups); } int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); /* * get_device - atomically increment the reference count for the device. * */ struct device *get_device(struct device *dev); void put_device(struct device *dev); DEFINE_FREE(put_device, struct device *, if (_T) put_device(_T)) bool kill_device(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_mount(void); #else static inline int devtmpfs_mount(void) { return 0; } #endif /* drivers/base/power/shutdown.c */ void device_shutdown(void); /* debugging and troubleshooting/diagnostic helpers. */ const char *dev_driver_string(const struct device *dev); /* Device links interface. */ struct device_link *device_link_add(struct device *consumer, struct device *supplier, u32 flags); void device_link_del(struct device_link *link); void device_link_remove(void *consumer, struct device *supplier); void device_links_supplier_sync_state_pause(void); void device_links_supplier_sync_state_resume(void); void device_link_wait_removal(void); static inline bool device_link_test(const struct device_link *link, u32 flags) { return !!(link->flags & flags); } /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_CHARDEV_MAJOR(major) \ MODULE_ALIAS("char-major-" __stringify(major) "-*") #endif /* _DEVICE_H_ */
1182 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __VDSO_MATH64_H #define __VDSO_MATH64_H static __always_inline u32 __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder) { u32 ret = 0; while (dividend >= divisor) { /* The following asm() prevents the compiler from optimising this loop into a modulo operation. */ asm("" : "+rm"(dividend)); dividend -= divisor; ret++; } *remainder = dividend; return ret; } #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_add_u64_shr static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift) { return (u64)((((unsigned __int128)a * mul) + b) >> shift); } #endif /* mul_u64_u32_add_u64_shr */ #else #ifndef mul_u64_u32_add_u64_shr #ifndef mul_u32_u32 static inline u64 mul_u32_u32(u32 a, u32 b) { return (u64)a * b; } #define mul_u32_u32 mul_u32_u32 #endif static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift) { u32 ah = a >> 32, al = a; bool ovf; u64 ret; ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, &ret); ret >>= shift; if (ovf && shift) ret += 1ULL << (64 - shift); if (ah) ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } #endif /* mul_u64_u32_add_u64_shr */ #endif #endif /* __VDSO_MATH64_H */
1 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Mayflash game controller adapters. * * These devices are manufactured by Mayflash but identify themselves * using the vendor ID of DragonRise Inc. * * Tested with: * 0079:1801 "DragonRise Inc. Mayflash PS3 Game Controller Adapter" * 0079:1803 "DragonRise Inc. Mayflash Wireless Sensor DolphinBar" * 0079:1843 "DragonRise Inc. Mayflash GameCube Game Controller Adapter" * 0079:1844 "DragonRise Inc. Mayflash GameCube Game Controller Adapter (v04)" * * The following adapters probably work too, but need to be tested: * 0079:1800 "DragonRise Inc. Mayflash WIIU Game Controller Adapter" * * Copyright (c) 2016-2017 Marcel Hasler <mahasler@gmail.com> */ /* */ #include <linux/input.h> #include <linux/slab.h> #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" struct mf_device { struct hid_report *report; }; static int mf_play(struct input_dev *dev, void *data, struct ff_effect *effect) { struct hid_device *hid = input_get_drvdata(dev); struct mf_device *mf = data; int strong, weak; strong = effect->u.rumble.strong_magnitude; weak = effect->u.rumble.weak_magnitude; dbg_hid("Called with 0x%04x 0x%04x.\n", strong, weak); strong = strong * 0xff / 0xffff; weak = weak * 0xff / 0xffff; dbg_hid("Running with 0x%02x 0x%02x.\n", strong, weak); mf->report->field[0]->value[0] = weak; mf->report->field[0]->value[1] = strong; hid_hw_request(hid, mf->report, HID_REQ_SET_REPORT); return 0; } static int mf_init(struct hid_device *hid) { struct mf_device *mf; struct list_head *report_list = &hid->report_enum[HID_OUTPUT_REPORT].report_list; struct list_head *report_ptr; struct hid_report *report; struct list_head *input_ptr = &hid->inputs; struct hid_input *input; struct input_dev *dev; int error; /* Setup each of the four inputs */ list_for_each(report_ptr, report_list) { report = list_entry(report_ptr, struct hid_report, list); if (report->maxfield < 1 || report->field[0]->report_count < 2) { hid_err(hid, "Invalid report, this should never happen!\n"); return -ENODEV; } if (list_is_last(input_ptr, &hid->inputs)) { hid_err(hid, "Missing input, this should never happen!\n"); return -ENODEV; } input_ptr = input_ptr->next; input = list_entry(input_ptr, struct hid_input, list); mf = kzalloc_obj(struct mf_device); if (!mf) return -ENOMEM; dev = input->input; set_bit(FF_RUMBLE, dev->ffbit); error = input_ff_create_memless(dev, mf, mf_play); if (error) { kfree(mf); return error; } mf->report = report; mf->report->field[0]->value[0] = 0x00; mf->report->field[0]->value[1] = 0x00; hid_hw_request(hid, mf->report, HID_REQ_SET_REPORT); } hid_info(hid, "Force feedback for HJZ Mayflash game controller " "adapters by Marcel Hasler <mahasler@gmail.com>\n"); return 0; } static int mf_probe(struct hid_device *hid, const struct hid_device_id *id) { int error; dev_dbg(&hid->dev, "Mayflash HID hardware probe...\n"); /* Apply quirks as needed */ hid->quirks |= id->driver_data; error = hid_parse(hid); if (error) { hid_err(hid, "HID parse failed.\n"); return error; } error = hid_hw_start(hid, HID_CONNECT_DEFAULT & ~HID_CONNECT_FF); if (error) { hid_err(hid, "HID hw start failed\n"); return error; } error = mf_init(hid); if (error) { hid_err(hid, "Force feedback init failed.\n"); hid_hw_stop(hid); return error; } return 0; } static const struct hid_device_id mf_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_PS3), .driver_data = HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_DOLPHINBAR), .driver_data = HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_GAMECUBE1), .driver_data = HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_GAMECUBE2), .driver_data = 0 }, /* No quirk required */ { HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_GAMECUBE3), .driver_data = HID_QUIRK_MULTI_INPUT }, { } }; MODULE_DEVICE_TABLE(hid, mf_devices); static struct hid_driver mf_driver = { .name = "hid_mf", .id_table = mf_devices, .probe = mf_probe, }; module_hid_driver(mf_driver); MODULE_DESCRIPTION("Force feedback support for Mayflash game controller adapters."); MODULE_LICENSE("GPL");
900 29650 558 29663 353 353 1141 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_FILE_REF_H #define _LINUX_FILE_REF_H #include <linux/atomic.h> #include <linux/preempt.h> #include <linux/types.h> /* * file_ref is a reference count implementation specifically for use by * files. It takes inspiration from rcuref but differs in key aspects * such as support for SLAB_TYPESAFE_BY_RCU type caches. * * FILE_REF_ONEREF FILE_REF_MAXREF * 0x0000000000000000UL 0x7FFFFFFFFFFFFFFFUL * <-------------------valid -------------------> * * FILE_REF_SATURATED * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL * <-----------------------saturation zone----------------------> * * FILE_REF_RELEASED FILE_REF_DEAD * 0xC000000000000000UL 0xE000000000000000UL * <-------------------dead zone-------------------> * * FILE_REF_NOREF * 0xFFFFFFFFFFFFFFFFUL */ #ifdef CONFIG_64BIT #define FILE_REF_ONEREF 0x0000000000000000UL #define FILE_REF_MAXREF 0x7FFFFFFFFFFFFFFFUL #define FILE_REF_SATURATED 0xA000000000000000UL #define FILE_REF_RELEASED 0xC000000000000000UL #define FILE_REF_DEAD 0xE000000000000000UL #define FILE_REF_NOREF 0xFFFFFFFFFFFFFFFFUL #else #define FILE_REF_ONEREF 0x00000000U #define FILE_REF_MAXREF 0x7FFFFFFFU #define FILE_REF_SATURATED 0xA0000000U #define FILE_REF_RELEASED 0xC0000000U #define FILE_REF_DEAD 0xE0000000U #define FILE_REF_NOREF 0xFFFFFFFFU #endif typedef struct { #ifdef CONFIG_64BIT atomic64_t refcnt; #else atomic_t refcnt; #endif } file_ref_t; /** * file_ref_init - Initialize a file reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) { atomic_long_set(&ref->refcnt, cnt - 1); } bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** * file_ref_get - Acquire one reference on a file * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at FILE_REF_MAXREF. * * Provides full memory ordering. * * Return: False if the attempt to acquire a reference failed. This happens * when the last reference has been put already. True if a reference * was successfully acquired */ static __always_inline __must_check bool file_ref_get(file_ref_t *ref) { /* * Unconditionally increase the reference count with full * ordering. The saturation and dead zones provide enough * tolerance for this. * * If this indicates negative the file in question the fail can * be freed and immediately reused due to SLAB_TYPSAFE_BY_RCU. * Hence, unconditionally altering the file reference count to * e.g., reset the file reference count back to the middle of * the deadzone risk end up marking someone else's file as dead * behind their back. * * It would be possible to do a careful: * * cnt = atomic_long_inc_return(); * if (likely(cnt >= 0)) * return true; * * and then something like: * * if (cnt >= FILE_REF_RELEASE) * atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD), * * to set the value back to the middle of the deadzone. But it's * practically impossible to go from FILE_REF_DEAD to * FILE_REF_ONEREF. It would need 2305843009213693952/2^61 * file_ref_get()s to resurrect such a dead file. */ return !atomic_long_add_negative(1, &ref->refcnt); } /** * file_ref_inc - Acquire one reference on a file * @ref: Pointer to the reference count * * Acquire an additional reference on a file. Warns if the caller didn't * already hold a reference. */ static __always_inline void file_ref_inc(file_ref_t *ref) { long prior = atomic_long_fetch_inc_relaxed(&ref->refcnt); WARN_ONCE(prior < 0, "file_ref_inc() on a released file reference"); } /** * file_ref_put -- Release a file reference * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores * are done before, and provides an acquire ordering on success such * that free() must come after. * * Return: True if this was the last reference with no future references * possible. This signals the caller that it can safely release * the object which is protected by the reference counter. * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) { long cnt; /* * While files are SLAB_TYPESAFE_BY_RCU and thus file_ref_put() * calls don't risk UAFs when a file is recyclyed, it is still * vulnerable to UAFs caused by freeing the whole slab page once * it becomes unused. Prevent file_ref_put() from being * preempted protects against this. */ guard(preempt)(); /* * Unconditionally decrease the reference count. The saturation * and dead zones provide enough tolerance for this. If this * fails then we need to handle the last reference drop and * cases inside the saturation and dead zones. */ cnt = atomic_long_dec_return(&ref->refcnt); if (cnt >= 0) return false; return __file_ref_put(ref, cnt); } /** * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF * @ref: Pointer to the reference count * * Semantically it is equivalent to calling file_ref_put(), but it trades lower * performance in face of other CPUs also modifying the refcount for higher * performance when this happens to be the last reference. * * For the last reference file_ref_put() issues 2 atomics. One to drop the * reference and another to transition it to FILE_REF_DEAD. This routine does * the work in one step, but in order to do it has to pre-read the variable which * decreases scalability. * * Use with close() et al, stick to file_ref_put() by default. */ static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) { long old; old = atomic_long_read(&ref->refcnt); if (likely(old == FILE_REF_ONEREF)) { if (likely(atomic_long_try_cmpxchg(&ref->refcnt, &old, FILE_REF_DEAD))) return true; } return file_ref_put(ref); } /** * file_ref_read - Read the number of file references * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N) */ static inline unsigned long file_ref_read(file_ref_t *ref) { unsigned long c = atomic_long_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= FILE_REF_RELEASED ? 0 : c + 1; } /* * __file_ref_read_raw - Return the value stored in ref->refcnt * @ref: Pointer to the reference count * * Return: The raw value found in the counter * * A hack for file_needs_f_pos_lock(), you probably want to use * file_ref_read() instead. */ static inline unsigned long __file_ref_read_raw(file_ref_t *ref) { return atomic_long_read(&ref->refcnt); } #endif
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 // SPDX-License-Identifier: GPL-2.0-only /* * i8042 keyboard and mouse controller driver for Linux * * Copyright (c) 1999-2004 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/serio.h> #include <linux/err.h> #include <linux/rcupdate.h> #include <linux/platform_device.h> #include <linux/i8042.h> #include <linux/slab.h> #include <linux/suspend.h> #include <linux/property.h> #include <asm/io.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>"); MODULE_DESCRIPTION("i8042 keyboard and mouse controller driver"); MODULE_LICENSE("GPL"); static bool i8042_nokbd; module_param_named(nokbd, i8042_nokbd, bool, 0); MODULE_PARM_DESC(nokbd, "Do not probe or use KBD port."); static bool i8042_noaux; module_param_named(noaux, i8042_noaux, bool, 0); MODULE_PARM_DESC(noaux, "Do not probe or use AUX (mouse) port."); static bool i8042_nomux; module_param_named(nomux, i8042_nomux, bool, 0); MODULE_PARM_DESC(nomux, "Do not check whether an active multiplexing controller is present."); static bool i8042_unlock; module_param_named(unlock, i8042_unlock, bool, 0); MODULE_PARM_DESC(unlock, "Ignore keyboard lock."); static bool i8042_probe_defer; module_param_named(probe_defer, i8042_probe_defer, bool, 0); MODULE_PARM_DESC(probe_defer, "Allow deferred probing."); enum i8042_controller_reset_mode { I8042_RESET_NEVER, I8042_RESET_ALWAYS, I8042_RESET_ON_S2RAM, #define I8042_RESET_DEFAULT I8042_RESET_ON_S2RAM }; static enum i8042_controller_reset_mode i8042_reset = I8042_RESET_DEFAULT; static int i8042_set_reset(const char *val, const struct kernel_param *kp) { enum i8042_controller_reset_mode *arg = kp->arg; int error; bool reset; if (val) { error = kstrtobool(val, &reset); if (error) return error; } else { reset = true; } *arg = reset ? I8042_RESET_ALWAYS : I8042_RESET_NEVER; return 0; } static const struct kernel_param_ops param_ops_reset_param = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = i8042_set_reset, }; #define param_check_reset_param(name, p) \ __param_check(name, p, enum i8042_controller_reset_mode) module_param_named(reset, i8042_reset, reset_param, 0); MODULE_PARM_DESC(reset, "Reset controller on resume, cleanup or both"); static bool i8042_direct; module_param_named(direct, i8042_direct, bool, 0); MODULE_PARM_DESC(direct, "Put keyboard port into non-translated mode."); static bool i8042_dumbkbd; module_param_named(dumbkbd, i8042_dumbkbd, bool, 0); MODULE_PARM_DESC(dumbkbd, "Pretend that controller can only read data from keyboard"); static bool i8042_noloop; module_param_named(noloop, i8042_noloop, bool, 0); MODULE_PARM_DESC(noloop, "Disable the AUX Loopback command while probing for the AUX port"); static bool i8042_notimeout; module_param_named(notimeout, i8042_notimeout, bool, 0); MODULE_PARM_DESC(notimeout, "Ignore timeouts signalled by i8042"); static bool i8042_kbdreset; module_param_named(kbdreset, i8042_kbdreset, bool, 0); MODULE_PARM_DESC(kbdreset, "Reset device connected to KBD port"); #ifdef CONFIG_X86 static bool i8042_dritek; module_param_named(dritek, i8042_dritek, bool, 0); MODULE_PARM_DESC(dritek, "Force enable the Dritek keyboard extension"); #endif #ifdef CONFIG_PNP static bool i8042_nopnp; module_param_named(nopnp, i8042_nopnp, bool, 0); MODULE_PARM_DESC(nopnp, "Do not use PNP to detect controller settings"); #endif static bool i8042_forcenorestore; module_param_named(forcenorestore, i8042_forcenorestore, bool, 0); MODULE_PARM_DESC(forcenorestore, "Force no restore on s3 resume, copying s2idle behaviour"); #define DEBUG #ifdef DEBUG static bool i8042_debug; module_param_named(debug, i8042_debug, bool, 0600); MODULE_PARM_DESC(debug, "Turn i8042 debugging mode on and off"); static bool i8042_unmask_kbd_data; module_param_named(unmask_kbd_data, i8042_unmask_kbd_data, bool, 0600); MODULE_PARM_DESC(unmask_kbd_data, "Unconditional enable (may reveal sensitive data) of normally sanitize-filtered kbd data traffic debug log [pre-condition: i8042.debug=1 enabled]"); #endif static bool i8042_present; static bool i8042_bypass_aux_irq_test; static char i8042_kbd_firmware_id[128]; static char i8042_aux_firmware_id[128]; static struct fwnode_handle *i8042_kbd_fwnode; #include "i8042.h" /* * i8042_lock protects serialization between i8042_command and * the interrupt handler. */ static DEFINE_SPINLOCK(i8042_lock); /* * Writers to AUX and KBD ports as well as users issuing i8042_command * directly should acquire i8042_mutex (by means of calling * i8042_lock_chip() and i8042_unlock_chip() helpers) to ensure that * they do not disturb each other (unfortunately in many i8042 * implementations write to one of the ports will immediately abort * command that is being processed by another port). */ static DEFINE_MUTEX(i8042_mutex); struct i8042_port { struct serio *serio; int irq; bool exists; bool driver_bound; signed char mux; }; #define I8042_KBD_PORT_NO 0 #define I8042_AUX_PORT_NO 1 #define I8042_MUX_PORT_NO 2 #define I8042_NUM_PORTS (I8042_NUM_MUX_PORTS + 2) static struct i8042_port i8042_ports[I8042_NUM_PORTS]; static unsigned char i8042_initial_ctr; static unsigned char i8042_ctr; static bool i8042_mux_present; static bool i8042_kbd_irq_registered; static bool i8042_aux_irq_registered; static unsigned char i8042_suppress_kbd_ack; static struct platform_device *i8042_platform_device; static struct notifier_block i8042_kbd_bind_notifier_block; static bool i8042_handle_data(int irq); static i8042_filter_t i8042_platform_filter; static void *i8042_platform_filter_context; void i8042_lock_chip(void) { mutex_lock(&i8042_mutex); } EXPORT_SYMBOL(i8042_lock_chip); void i8042_unlock_chip(void) { mutex_unlock(&i8042_mutex); } EXPORT_SYMBOL(i8042_unlock_chip); int i8042_install_filter(i8042_filter_t filter, void *context) { guard(spinlock_irqsave)(&i8042_lock); if (i8042_platform_filter) return -EBUSY; i8042_platform_filter = filter; i8042_platform_filter_context = context; return 0; } EXPORT_SYMBOL(i8042_install_filter); int i8042_remove_filter(i8042_filter_t filter) { guard(spinlock_irqsave)(&i8042_lock); if (i8042_platform_filter != filter) return -EINVAL; i8042_platform_filter = NULL; i8042_platform_filter_context = NULL; return 0; } EXPORT_SYMBOL(i8042_remove_filter); /* * The i8042_wait_read() and i8042_wait_write functions wait for the i8042 to * be ready for reading values from it / writing values to it. * Called always with i8042_lock held. */ static int i8042_wait_read(void) { int i = 0; while ((~i8042_read_status() & I8042_STR_OBF) && (i < I8042_CTL_TIMEOUT)) { udelay(50); i++; } return -(i == I8042_CTL_TIMEOUT); } static int i8042_wait_write(void) { int i = 0; while ((i8042_read_status() & I8042_STR_IBF) && (i < I8042_CTL_TIMEOUT)) { udelay(50); i++; } return -(i == I8042_CTL_TIMEOUT); } /* * i8042_flush() flushes all data that may be in the keyboard and mouse buffers * of the i8042 down the toilet. */ static int i8042_flush(void) { unsigned char data, str; int count = 0; guard(spinlock_irqsave)(&i8042_lock); while ((str = i8042_read_status()) & I8042_STR_OBF) { if (count++ >= I8042_BUFFER_SIZE) return -EIO; udelay(50); data = i8042_read_data(); dbg("%02x <- i8042 (flush, %s)\n", data, str & I8042_STR_AUXDATA ? "aux" : "kbd"); } return 0; } /* * i8042_command() executes a command on the i8042. It also sends the input * parameter(s) of the commands to it, and receives the output value(s). The * parameters are to be stored in the param array, and the output is placed * into the same array. The number of the parameters and output values is * encoded in bits 8-11 of the command number. */ static int __i8042_command(unsigned char *param, int command) { int i, error; if (i8042_noloop && command == I8042_CMD_AUX_LOOP) return -1; error = i8042_wait_write(); if (error) return error; dbg("%02x -> i8042 (command)\n", command & 0xff); i8042_write_command(command & 0xff); for (i = 0; i < ((command >> 12) & 0xf); i++) { error = i8042_wait_write(); if (error) { dbg(" -- i8042 (wait write timeout)\n"); return error; } dbg("%02x -> i8042 (parameter)\n", param[i]); i8042_write_data(param[i]); } for (i = 0; i < ((command >> 8) & 0xf); i++) { error = i8042_wait_read(); if (error) { dbg(" -- i8042 (wait read timeout)\n"); return error; } if (command == I8042_CMD_AUX_LOOP && !(i8042_read_status() & I8042_STR_AUXDATA)) { dbg(" -- i8042 (auxerr)\n"); return -1; } param[i] = i8042_read_data(); dbg("%02x <- i8042 (return)\n", param[i]); } return 0; } int i8042_command(unsigned char *param, int command) { if (!i8042_present) return -1; guard(spinlock_irqsave)(&i8042_lock); return __i8042_command(param, command); } EXPORT_SYMBOL(i8042_command); /* * i8042_kbd_write() sends a byte out through the keyboard interface. */ static int i8042_kbd_write(struct serio *port, unsigned char c) { int error; guard(spinlock_irqsave)(&i8042_lock); error = i8042_wait_write(); if (error) return error; dbg("%02x -> i8042 (kbd-data)\n", c); i8042_write_data(c); return 0; } /* * i8042_aux_write() sends a byte out through the aux interface. */ static int i8042_aux_write(struct serio *serio, unsigned char c) { struct i8042_port *port = serio->port_data; return i8042_command(&c, port->mux == -1 ? I8042_CMD_AUX_SEND : I8042_CMD_MUX_SEND + port->mux); } /* * i8042_port_close attempts to clear AUX or KBD port state by disabling * and then re-enabling it. */ static void i8042_port_close(struct serio *serio) { int irq_bit; int disable_bit; const char *port_name; if (serio == i8042_ports[I8042_AUX_PORT_NO].serio) { irq_bit = I8042_CTR_AUXINT; disable_bit = I8042_CTR_AUXDIS; port_name = "AUX"; } else { irq_bit = I8042_CTR_KBDINT; disable_bit = I8042_CTR_KBDDIS; port_name = "KBD"; } i8042_ctr &= ~irq_bit; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't write CTR while closing %s port\n", port_name); udelay(50); i8042_ctr &= ~disable_bit; i8042_ctr |= irq_bit; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_err("Can't reactivate %s port\n", port_name); /* * See if there is any data appeared while we were messing with * port state. */ i8042_handle_data(0); } /* * i8042_start() is called by serio core when port is about to finish * registering. It will mark port as existing so i8042_interrupt can * start sending data through it. */ static int i8042_start(struct serio *serio) { struct i8042_port *port = serio->port_data; device_set_wakeup_capable(&serio->dev, true); /* * On platforms using suspend-to-idle, allow the keyboard to * wake up the system from sleep by enabling keyboard wakeups * by default. This is consistent with keyboard wakeup * behavior on many platforms using suspend-to-RAM (ACPI S3) * by default. */ if (pm_suspend_default_s2idle() && serio == i8042_ports[I8042_KBD_PORT_NO].serio) { device_set_wakeup_enable(&serio->dev, true); } guard(spinlock_irq)(&i8042_lock); port->exists = true; return 0; } /* * i8042_stop() marks serio port as non-existing so i8042_interrupt * will not try to send data to the port that is about to go away. * The function is called by serio core as part of unregister procedure. */ static void i8042_stop(struct serio *serio) { struct i8042_port *port = serio->port_data; scoped_guard(spinlock_irq, &i8042_lock) { port->exists = false; port->serio = NULL; } /* * We need to make sure that interrupt handler finishes using * our serio port before we return from this function. * We synchronize with both AUX and KBD IRQs because there is * a (very unlikely) chance that AUX IRQ is raised for KBD port * and vice versa. */ synchronize_irq(I8042_AUX_IRQ); synchronize_irq(I8042_KBD_IRQ); } /* * i8042_filter() filters out unwanted bytes from the input data stream. * It is called from i8042_interrupt and thus is running with interrupts * off and i8042_lock held. */ static bool i8042_filter(unsigned char data, unsigned char str, struct serio *serio) { if (unlikely(i8042_suppress_kbd_ack)) { if ((~str & I8042_STR_AUXDATA) && (data == 0xfa || data == 0xfe)) { i8042_suppress_kbd_ack--; dbg("Extra keyboard ACK - filtered out\n"); return true; } } if (!i8042_platform_filter) return false; if (i8042_platform_filter(data, str, serio, i8042_platform_filter_context)) { dbg("Filtered out by platform filter\n"); return true; } return false; } /* * i8042_handle_mux() handles case when data is coming from one of * the multiplexed ports. It would be simple if not for quirks with * handling errors: * * When MUXERR condition is signalled the data register can only contain * 0xfd, 0xfe or 0xff if implementation follows the spec. Unfortunately * it is not always the case. Some KBCs also report 0xfc when there is * nothing connected to the port while others sometimes get confused which * port the data came from and signal error leaving the data intact. They * _do not_ revert to legacy mode (actually I've never seen KBC reverting * to legacy mode yet, when we see one we'll add proper handling). * Anyway, we process 0xfc, 0xfd, 0xfe and 0xff as timeouts, and for the * rest assume that the data came from the same serio last byte * was transmitted (if transmission happened not too long ago). */ static int i8042_handle_mux(u8 str, u8 *data, unsigned int *dfl) { static unsigned long last_transmit; static unsigned long last_port; unsigned int mux_port; mux_port = (str >> 6) & 3; *dfl = 0; if (str & I8042_STR_MUXERR) { dbg("MUX error, status is %02x, data is %02x\n", str, *data); switch (*data) { default: if (time_before(jiffies, last_transmit + HZ/10)) { mux_port = last_port; break; } fallthrough; /* report timeout */ case 0xfc: case 0xfd: case 0xfe: *dfl = SERIO_TIMEOUT; *data = 0xfe; break; case 0xff: *dfl = SERIO_PARITY; *data = 0xfe; break; } } last_port = mux_port; last_transmit = jiffies; return I8042_MUX_PORT_NO + mux_port; } /* * i8042_handle_data() is the most important function in this driver - * it reads the data from the i8042, determines its destination serio * port, and sends received byte to the upper layers. * * Returns true if there was data waiting, false otherwise. */ static bool i8042_handle_data(int irq) { struct i8042_port *port; struct serio *serio; unsigned char str, data; unsigned int dfl; unsigned int port_no; bool filtered; scoped_guard(spinlock_irqsave, &i8042_lock) { str = i8042_read_status(); if (unlikely(~str & I8042_STR_OBF)) return false; data = i8042_read_data(); if (i8042_mux_present && (str & I8042_STR_AUXDATA)) { port_no = i8042_handle_mux(str, &data, &dfl); } else { dfl = (str & I8042_STR_PARITY) ? SERIO_PARITY : 0; if ((str & I8042_STR_TIMEOUT) && !i8042_notimeout) dfl |= SERIO_TIMEOUT; port_no = (str & I8042_STR_AUXDATA) ? I8042_AUX_PORT_NO : I8042_KBD_PORT_NO; } port = &i8042_ports[port_no]; serio = port->exists ? port->serio : NULL; filter_dbg(port->driver_bound, data, "<- i8042 (interrupt, %d, %d%s%s)\n", port_no, irq, dfl & SERIO_PARITY ? ", bad parity" : "", dfl & SERIO_TIMEOUT ? ", timeout" : ""); filtered = i8042_filter(data, str, serio); } if (likely(serio && !filtered)) serio_interrupt(serio, data, dfl); return true; } static irqreturn_t i8042_interrupt(int irq, void *dev_id) { if (unlikely(!i8042_handle_data(irq))) { dbg("Interrupt %d, without any data\n", irq); return IRQ_NONE; } return IRQ_HANDLED; } /* * i8042_enable_kbd_port enables keyboard port on chip */ static int i8042_enable_kbd_port(void) { i8042_ctr &= ~I8042_CTR_KBDDIS; i8042_ctr |= I8042_CTR_KBDINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_KBDINT; i8042_ctr |= I8042_CTR_KBDDIS; pr_err("Failed to enable KBD port\n"); return -EIO; } return 0; } /* * i8042_enable_aux_port enables AUX (mouse) port on chip */ static int i8042_enable_aux_port(void) { i8042_ctr &= ~I8042_CTR_AUXDIS; i8042_ctr |= I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_AUXINT; i8042_ctr |= I8042_CTR_AUXDIS; pr_err("Failed to enable AUX port\n"); return -EIO; } return 0; } /* * i8042_enable_mux_ports enables 4 individual AUX ports after * the controller has been switched into Multiplexed mode */ static int i8042_enable_mux_ports(void) { unsigned char param; int i; for (i = 0; i < I8042_NUM_MUX_PORTS; i++) { i8042_command(&param, I8042_CMD_MUX_PFX + i); i8042_command(&param, I8042_CMD_AUX_ENABLE); } return i8042_enable_aux_port(); } /* * i8042_set_mux_mode checks whether the controller has an * active multiplexor and puts the chip into Multiplexed (true) * or Legacy (false) mode. */ static int i8042_set_mux_mode(bool multiplex, unsigned char *mux_version) { unsigned char param, val; /* * Get rid of bytes in the queue. */ i8042_flush(); /* * Internal loopback test - send three bytes, they should come back from the * mouse interface, the last should be version. */ param = val = 0xf0; if (i8042_command(&param, I8042_CMD_AUX_LOOP) || param != val) return -1; param = val = multiplex ? 0x56 : 0xf6; if (i8042_command(&param, I8042_CMD_AUX_LOOP) || param != val) return -1; param = val = multiplex ? 0xa4 : 0xa5; if (i8042_command(&param, I8042_CMD_AUX_LOOP) || param == val) return -1; /* * Workaround for interference with USB Legacy emulation * that causes a v10.12 MUX to be found. */ if (param == 0xac) return -1; if (mux_version) *mux_version = param; return 0; } /* * i8042_check_mux() checks whether the controller supports the PS/2 Active * Multiplexing specification by Synaptics, Phoenix, Insyde and * LCS/Telegraphics. */ static int i8042_check_mux(void) { unsigned char mux_version; if (i8042_set_mux_mode(true, &mux_version)) return -1; pr_info("Detected active multiplexing controller, rev %d.%d\n", (mux_version >> 4) & 0xf, mux_version & 0xf); /* * Disable all muxed ports by disabling AUX. */ i8042_ctr |= I8042_CTR_AUXDIS; i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("Failed to disable AUX port, can't use MUX\n"); return -EIO; } i8042_mux_present = true; return 0; } /* * The following is used to test AUX IRQ delivery. */ static struct completion i8042_aux_irq_delivered; static bool i8042_irq_being_tested; static irqreturn_t i8042_aux_test_irq(int irq, void *dev_id) { unsigned char str, data; guard(spinlock_irqsave)(&i8042_lock); str = i8042_read_status(); if (!(str & I8042_STR_OBF)) return IRQ_NONE; data = i8042_read_data(); dbg("%02x <- i8042 (aux_test_irq, %s)\n", data, str & I8042_STR_AUXDATA ? "aux" : "kbd"); if (i8042_irq_being_tested && data == 0xa5 && (str & I8042_STR_AUXDATA)) complete(&i8042_aux_irq_delivered); return IRQ_HANDLED; } /* * i8042_toggle_aux - enables or disables AUX port on i8042 via command and * verifies success by readinng CTR. Used when testing for presence of AUX * port. */ static int i8042_toggle_aux(bool on) { unsigned char param; int i; if (i8042_command(&param, on ? I8042_CMD_AUX_ENABLE : I8042_CMD_AUX_DISABLE)) return -1; /* some chips need some time to set the I8042_CTR_AUXDIS bit */ for (i = 0; i < 100; i++) { udelay(50); if (i8042_command(&param, I8042_CMD_CTL_RCTR)) return -1; if (!(param & I8042_CTR_AUXDIS) == on) return 0; } return -1; } /* * i8042_check_aux() applies as much paranoia as it can at detecting * the presence of an AUX interface. */ static int i8042_check_aux(void) { int retval = -1; bool irq_registered = false; bool aux_loop_broken = false; unsigned char param; /* * Get rid of bytes in the queue. */ i8042_flush(); /* * Internal loopback test - filters out AT-type i8042's. Unfortunately * SiS screwed up and their 5597 doesn't support the LOOP command even * though it has an AUX port. */ param = 0x5a; retval = i8042_command(&param, I8042_CMD_AUX_LOOP); if (retval || param != 0x5a) { /* * External connection test - filters out AT-soldered PS/2 i8042's * 0x00 - no error, 0x01-0x03 - clock/data stuck, 0xff - general error * 0xfa - no error on some notebooks which ignore the spec * Because it's common for chipsets to return error on perfectly functioning * AUX ports, we test for this only when the LOOP command failed. */ if (i8042_command(&param, I8042_CMD_AUX_TEST) || (param && param != 0xfa && param != 0xff)) return -1; /* * If AUX_LOOP completed without error but returned unexpected data * mark it as broken */ if (!retval) aux_loop_broken = true; } /* * Bit assignment test - filters out PS/2 i8042's in AT mode */ if (i8042_toggle_aux(false)) { pr_warn("Failed to disable AUX port, but continuing anyway... Is this a SiS?\n"); pr_warn("If AUX port is really absent please use the 'i8042.noaux' option\n"); } if (i8042_toggle_aux(true)) return -1; /* * Reset keyboard (needed on some laptops to successfully detect * touchpad, e.g., some Gigabyte laptop models with Elantech * touchpads). */ if (i8042_kbdreset) { pr_warn("Attempting to reset device connected to KBD port\n"); i8042_kbd_write(NULL, (unsigned char) 0xff); } /* * Test AUX IRQ delivery to make sure BIOS did not grab the IRQ and * used it for a PCI card or somethig else. */ if (i8042_noloop || i8042_bypass_aux_irq_test || aux_loop_broken) { /* * Without LOOP command we can't test AUX IRQ delivery. Assume the port * is working and hope we are right. */ retval = 0; goto out; } if (request_irq(I8042_AUX_IRQ, i8042_aux_test_irq, IRQF_SHARED, "i8042", i8042_platform_device)) goto out; irq_registered = true; if (i8042_enable_aux_port()) goto out; scoped_guard(spinlock_irqsave, &i8042_lock) { init_completion(&i8042_aux_irq_delivered); i8042_irq_being_tested = true; param = 0xa5; retval = __i8042_command(&param, I8042_CMD_AUX_LOOP & 0xf0ff); if (retval) goto out; } if (wait_for_completion_timeout(&i8042_aux_irq_delivered, msecs_to_jiffies(250)) == 0) { /* * AUX IRQ was never delivered so we need to flush the controller to * get rid of the byte we put there; otherwise keyboard may not work. */ dbg(" -- i8042 (aux irq test timeout)\n"); i8042_flush(); retval = -1; } out: /* * Disable the interface. */ i8042_ctr |= I8042_CTR_AUXDIS; i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) retval = -1; if (irq_registered) free_irq(I8042_AUX_IRQ, i8042_platform_device); return retval; } static int i8042_controller_check(void) { if (i8042_flush()) { pr_info("No controller found\n"); return -ENODEV; } return 0; } static int i8042_controller_selftest(void) { unsigned char param; int i = 0; /* * We try this 5 times; on some really fragile systems this does not * take the first time... */ do { if (i8042_command(&param, I8042_CMD_CTL_TEST)) { pr_err("i8042 controller selftest timeout\n"); return -ENODEV; } if (param == I8042_RET_CTL_TEST) return 0; dbg("i8042 controller selftest: %#x != %#x\n", param, I8042_RET_CTL_TEST); msleep(50); } while (i++ < 5); #ifdef CONFIG_X86 /* * On x86, we don't fail entire i8042 initialization if controller * reset fails in hopes that keyboard port will still be functional * and user will still get a working keyboard. This is especially * important on netbooks. On other arches we trust hardware more. */ pr_info("giving up on controller selftest, continuing anyway...\n"); return 0; #else pr_err("i8042 controller selftest failed\n"); return -EIO; #endif } /* * i8042_controller_init initializes the i8042 controller, and, * most importantly, sets it into non-xlated mode if that's * desired. */ static int i8042_controller_init(void) { int n = 0; unsigned char ctr[2]; /* * Save the CTR for restore on unload / reboot. */ do { if (n >= 10) { pr_err("Unable to get stable CTR read\n"); return -EIO; } if (n != 0) udelay(50); if (i8042_command(&ctr[n++ % 2], I8042_CMD_CTL_RCTR)) { pr_err("Can't read CTR while initializing i8042\n"); return i8042_probe_defer ? -EPROBE_DEFER : -EIO; } } while (n < 2 || ctr[0] != ctr[1]); i8042_initial_ctr = i8042_ctr = ctr[0]; /* * Disable the keyboard interface and interrupt. */ i8042_ctr |= I8042_CTR_KBDDIS; i8042_ctr &= ~I8042_CTR_KBDINT; /* * Handle keylock. */ scoped_guard(spinlock_irqsave, &i8042_lock) { if (~i8042_read_status() & I8042_STR_KEYLOCK) { if (i8042_unlock) i8042_ctr |= I8042_CTR_IGNKEYLOCK; else pr_warn("Warning: Keylock active\n"); } } /* * If the chip is configured into nontranslated mode by the BIOS, don't * bother enabling translating and be happy. */ if (~i8042_ctr & I8042_CTR_XLATE) i8042_direct = true; /* * Set nontranslated mode for the kbd interface if requested by an option. * After this the kbd interface becomes a simple serial in/out, like the aux * interface is. We don't do this by default, since it can confuse notebook * BIOSes. */ if (i8042_direct) i8042_ctr &= ~I8042_CTR_XLATE; /* * Write CTR back. */ if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("Can't write CTR while initializing i8042\n"); return -EIO; } /* * Flush whatever accumulated while we were disabling keyboard port. */ i8042_flush(); return 0; } /* * Reset the controller and reset CRT to the original value set by BIOS. */ static void i8042_controller_reset(bool s2r_wants_reset) { i8042_flush(); /* * Disable both KBD and AUX interfaces so they don't get in the way */ i8042_ctr |= I8042_CTR_KBDDIS | I8042_CTR_AUXDIS; i8042_ctr &= ~(I8042_CTR_KBDINT | I8042_CTR_AUXINT); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't write CTR while resetting\n"); /* * Disable MUX mode if present. */ if (i8042_mux_present) i8042_set_mux_mode(false, NULL); /* * Reset the controller if requested. */ if (i8042_reset == I8042_RESET_ALWAYS || (i8042_reset == I8042_RESET_ON_S2RAM && s2r_wants_reset)) { i8042_controller_selftest(); } /* * Restore the original control register setting. */ if (i8042_command(&i8042_initial_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't restore CTR\n"); } /* * i8042_panic_blink() will turn the keyboard LEDs on or off and is called * when kernel panics. Flashing LEDs is useful for users running X who may * not see the console and will help distinguishing panics from "real" * lockups. * * Note that DELAY has a limit of 10ms so we will not get stuck here * waiting for KBC to free up even if KBD interrupt is off */ #define DELAY do { mdelay(1); if (++delay > 10) return delay; } while(0) static long i8042_panic_blink(int state) { long delay = 0; char led; led = (state) ? 0x01 | 0x04 : 0; while (i8042_read_status() & I8042_STR_IBF) DELAY; dbg("%02x -> i8042 (panic blink)\n", 0xed); i8042_suppress_kbd_ack = 2; i8042_write_data(0xed); /* set leds */ DELAY; while (i8042_read_status() & I8042_STR_IBF) DELAY; DELAY; dbg("%02x -> i8042 (panic blink)\n", led); i8042_write_data(led); DELAY; return delay; } #undef DELAY #ifdef CONFIG_X86 static void i8042_dritek_enable(void) { unsigned char param = 0x90; int error; error = i8042_command(&param, 0x1059); if (error) pr_warn("Failed to enable DRITEK extension: %d\n", error); } #endif #ifdef CONFIG_PM /* * Here we try to reset everything back to a state we had * before suspending. */ static int i8042_controller_resume(bool s2r_wants_reset) { int error; error = i8042_controller_check(); if (error) return error; if (i8042_reset == I8042_RESET_ALWAYS || (i8042_reset == I8042_RESET_ON_S2RAM && s2r_wants_reset)) { error = i8042_controller_selftest(); if (error) return error; } /* * Restore original CTR value and disable all ports */ i8042_ctr = i8042_initial_ctr; if (i8042_direct) i8042_ctr &= ~I8042_CTR_XLATE; i8042_ctr |= I8042_CTR_AUXDIS | I8042_CTR_KBDDIS; i8042_ctr &= ~(I8042_CTR_AUXINT | I8042_CTR_KBDINT); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_warn("Can't write CTR to resume, retrying...\n"); msleep(50); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("CTR write retry failed\n"); return -EIO; } } #ifdef CONFIG_X86 if (i8042_dritek) i8042_dritek_enable(); #endif if (i8042_mux_present) { if (i8042_set_mux_mode(true, NULL) || i8042_enable_mux_ports()) pr_warn("failed to resume active multiplexor, mouse won't work\n"); } else if (i8042_ports[I8042_AUX_PORT_NO].serio) { i8042_enable_aux_port(); } if (i8042_ports[I8042_KBD_PORT_NO].serio) i8042_enable_kbd_port(); i8042_handle_data(0); return 0; } /* * Here we try to restore the original BIOS settings to avoid * upsetting it. */ static int i8042_pm_suspend(struct device *dev) { int i; if (!i8042_forcenorestore && pm_suspend_via_firmware()) i8042_controller_reset(true); /* Set up serio interrupts for system wakeup. */ for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (serio && device_may_wakeup(&serio->dev)) enable_irq_wake(i8042_ports[i].irq); } return 0; } static int i8042_pm_resume_noirq(struct device *dev) { if (i8042_forcenorestore || !pm_resume_via_firmware()) i8042_handle_data(0); return 0; } static int i8042_pm_resume(struct device *dev) { bool want_reset; int i; for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (serio && device_may_wakeup(&serio->dev)) disable_irq_wake(i8042_ports[i].irq); } /* * If platform firmware was not going to be involved in suspend, we did * not restore the controller state to whatever it had been at boot * time, so we do not need to do anything. */ if (i8042_forcenorestore || !pm_suspend_via_firmware()) return 0; /* * We only need to reset the controller if we are resuming after handing * off control to the platform firmware, otherwise we can simply restore * the mode. */ want_reset = pm_resume_via_firmware(); return i8042_controller_resume(want_reset); } static int i8042_pm_thaw(struct device *dev) { i8042_handle_data(0); return 0; } static int i8042_pm_reset(struct device *dev) { i8042_controller_reset(false); return 0; } static int i8042_pm_restore(struct device *dev) { return i8042_controller_resume(false); } static const struct dev_pm_ops i8042_pm_ops = { .suspend = i8042_pm_suspend, .resume_noirq = i8042_pm_resume_noirq, .resume = i8042_pm_resume, .thaw = i8042_pm_thaw, .poweroff = i8042_pm_reset, .restore = i8042_pm_restore, }; #endif /* CONFIG_PM */ /* * We need to reset the 8042 back to original mode on system shutdown, * because otherwise BIOSes will be confused. */ static void i8042_shutdown(struct platform_device *dev) { i8042_controller_reset(false); } static int i8042_create_kbd_port(void) { struct serio *serio; struct i8042_port *port = &i8042_ports[I8042_KBD_PORT_NO]; serio = kzalloc_obj(*serio); if (!serio) return -ENOMEM; serio->id.type = i8042_direct ? SERIO_8042 : SERIO_8042_XL; serio->write = i8042_dumbkbd ? NULL : i8042_kbd_write; serio->start = i8042_start; serio->stop = i8042_stop; serio->close = i8042_port_close; serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; strscpy(serio->name, "i8042 KBD port", sizeof(serio->name)); strscpy(serio->phys, I8042_KBD_PHYS_DESC, sizeof(serio->phys)); strscpy(serio->firmware_id, i8042_kbd_firmware_id, sizeof(serio->firmware_id)); set_primary_fwnode(&serio->dev, i8042_kbd_fwnode); port->serio = serio; port->irq = I8042_KBD_IRQ; return 0; } static int i8042_create_aux_port(int idx) { struct serio *serio; int port_no = idx < 0 ? I8042_AUX_PORT_NO : I8042_MUX_PORT_NO + idx; struct i8042_port *port = &i8042_ports[port_no]; serio = kzalloc_obj(*serio); if (!serio) return -ENOMEM; serio->id.type = SERIO_8042; serio->write = i8042_aux_write; serio->start = i8042_start; serio->stop = i8042_stop; serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; if (idx < 0) { strscpy(serio->name, "i8042 AUX port", sizeof(serio->name)); strscpy(serio->phys, I8042_AUX_PHYS_DESC, sizeof(serio->phys)); strscpy(serio->firmware_id, i8042_aux_firmware_id, sizeof(serio->firmware_id)); serio->close = i8042_port_close; } else { snprintf(serio->name, sizeof(serio->name), "i8042 AUX%d port", idx); snprintf(serio->phys, sizeof(serio->phys), I8042_MUX_PHYS_DESC, idx + 1); strscpy(serio->firmware_id, i8042_aux_firmware_id, sizeof(serio->firmware_id)); } port->serio = serio; port->mux = idx; port->irq = I8042_AUX_IRQ; return 0; } static void i8042_free_kbd_port(void) { kfree(i8042_ports[I8042_KBD_PORT_NO].serio); i8042_ports[I8042_KBD_PORT_NO].serio = NULL; } static void i8042_free_aux_ports(void) { int i; for (i = I8042_AUX_PORT_NO; i < I8042_NUM_PORTS; i++) { kfree(i8042_ports[i].serio); i8042_ports[i].serio = NULL; } } static void i8042_register_ports(void) { int i; for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (!serio) continue; printk(KERN_INFO "serio: %s at %#lx,%#lx irq %d\n", serio->name, (unsigned long) I8042_DATA_REG, (unsigned long) I8042_COMMAND_REG, i8042_ports[i].irq); serio_register_port(serio); } } static void i8042_unregister_ports(void) { int i; for (i = 0; i < I8042_NUM_PORTS; i++) { if (i8042_ports[i].serio) { serio_unregister_port(i8042_ports[i].serio); i8042_ports[i].serio = NULL; } } } static void i8042_free_irqs(void) { if (i8042_aux_irq_registered) free_irq(I8042_AUX_IRQ, i8042_platform_device); if (i8042_kbd_irq_registered) free_irq(I8042_KBD_IRQ, i8042_platform_device); i8042_aux_irq_registered = i8042_kbd_irq_registered = false; } static int i8042_setup_aux(void) { int (*aux_enable)(void); int error; int i; if (i8042_check_aux()) return -ENODEV; if (i8042_nomux || i8042_check_mux()) { error = i8042_create_aux_port(-1); if (error) goto err_free_ports; aux_enable = i8042_enable_aux_port; } else { for (i = 0; i < I8042_NUM_MUX_PORTS; i++) { error = i8042_create_aux_port(i); if (error) goto err_free_ports; } aux_enable = i8042_enable_mux_ports; } error = request_irq(I8042_AUX_IRQ, i8042_interrupt, IRQF_SHARED, "i8042", i8042_platform_device); if (error) goto err_free_ports; error = aux_enable(); if (error) goto err_free_irq; i8042_aux_irq_registered = true; return 0; err_free_irq: free_irq(I8042_AUX_IRQ, i8042_platform_device); err_free_ports: i8042_free_aux_ports(); return error; } static int i8042_setup_kbd(void) { int error; error = i8042_create_kbd_port(); if (error) return error; error = request_irq(I8042_KBD_IRQ, i8042_interrupt, IRQF_SHARED, "i8042", i8042_platform_device); if (error) goto err_free_port; error = i8042_enable_kbd_port(); if (error) goto err_free_irq; i8042_kbd_irq_registered = true; return 0; err_free_irq: free_irq(I8042_KBD_IRQ, i8042_platform_device); err_free_port: i8042_free_kbd_port(); return error; } static int i8042_kbd_bind_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; struct serio *serio = to_serio_port(dev); struct i8042_port *port = serio->port_data; if (serio != i8042_ports[I8042_KBD_PORT_NO].serio) return 0; switch (action) { case BUS_NOTIFY_BOUND_DRIVER: port->driver_bound = true; break; case BUS_NOTIFY_UNBIND_DRIVER: port->driver_bound = false; break; } return 0; } static int i8042_probe(struct platform_device *dev) { int error; if (i8042_reset == I8042_RESET_ALWAYS) { error = i8042_controller_selftest(); if (error) return error; } error = i8042_controller_init(); if (error) return error; #ifdef CONFIG_X86 if (i8042_dritek) i8042_dritek_enable(); #endif if (!i8042_noaux) { error = i8042_setup_aux(); if (error && error != -ENODEV && error != -EBUSY) goto out_fail; } if (!i8042_nokbd) { error = i8042_setup_kbd(); if (error) goto out_fail; } /* * Ok, everything is ready, let's register all serio ports */ i8042_register_ports(); return 0; out_fail: i8042_free_aux_ports(); /* in case KBD failed but AUX not */ i8042_free_irqs(); i8042_controller_reset(false); return error; } static void i8042_remove(struct platform_device *dev) { i8042_unregister_ports(); i8042_free_irqs(); i8042_controller_reset(false); } static struct platform_driver i8042_driver = { .driver = { .name = "i8042", #ifdef CONFIG_PM .pm = &i8042_pm_ops, #endif }, .probe = i8042_probe, .remove = i8042_remove, .shutdown = i8042_shutdown, }; static struct notifier_block i8042_kbd_bind_notifier_block = { .notifier_call = i8042_kbd_bind_notifier, }; static int __init i8042_init(void) { int err; dbg_init(); err = i8042_platform_init(); if (err) return (err == -ENODEV) ? 0 : err; err = i8042_controller_check(); if (err) goto err_platform_exit; /* Set this before creating the dev to allow i8042_command to work right away */ i8042_present = true; err = platform_driver_register(&i8042_driver); if (err) goto err_platform_exit; i8042_platform_device = platform_device_alloc("i8042", -1); if (!i8042_platform_device) { err = -ENOMEM; goto err_unregister_driver; } err = platform_device_add(i8042_platform_device); if (err) goto err_free_device; bus_register_notifier(&serio_bus, &i8042_kbd_bind_notifier_block); panic_blink = i8042_panic_blink; return 0; err_free_device: platform_device_put(i8042_platform_device); err_unregister_driver: platform_driver_unregister(&i8042_driver); err_platform_exit: i8042_platform_exit(); return err; } static void __exit i8042_exit(void) { if (!i8042_present) return; platform_device_unregister(i8042_platform_device); platform_driver_unregister(&i8042_driver); i8042_platform_exit(); bus_unregister_notifier(&serio_bus, &i8042_kbd_bind_notifier_block); panic_blink = NULL; } module_init(i8042_init); module_exit(i8042_exit);
27 84 1 85 85 2 2 83 85 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 // SPDX-License-Identifier: GPL-2.0-only /* * VLAN netlink control interface * * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include "vlan.h" static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = { [IFLA_VLAN_ID] = { .type = NLA_U16 }, [IFLA_VLAN_FLAGS] = { .len = sizeof(struct ifla_vlan_flags) }, [IFLA_VLAN_EGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_PROTOCOL] = { .type = NLA_U16 }, }; static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = { [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) }, }; static inline int vlan_validate_qos_map(struct nlattr *attr) { if (!attr) return 0; return nla_validate_nested_deprecated(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy, NULL); } static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ifla_vlan_flags *flags; u16 id; int err; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EADDRNOTAVAIL; } } if (!data) { NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified"); return -EINVAL; } if (data[IFLA_VLAN_PROTOCOL]) { switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): break; default: NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol"); return -EPROTONOSUPPORT; } } if (data[IFLA_VLAN_ID]) { id = nla_get_u16(data[IFLA_VLAN_ID]); if (id >= VLAN_VID_MASK) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id"); return -ERANGE; } } if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | VLAN_FLAG_BRIDGE_BINDING)) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags"); return -EINVAL; } } err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]); if (err < 0) { NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map"); return err; } err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]); if (err < 0) { NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map"); return err; } return 0; } static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ifla_vlan_flags *flags; struct ifla_vlan_qos_mapping *m; struct nlattr *attr; int rem, err; if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); err = vlan_dev_change_flags(dev, flags->flags, flags->mask); if (err) return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, data[IFLA_VLAN_INGRESS_QOS], rem) { m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING, data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) return err; } } return 0; } static int vlan_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; struct net_device *real_dev; unsigned int max_mtu; __be16 proto; int err; if (!data[IFLA_VLAN_ID]) { NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified"); return -EINVAL; } if (!tb[IFLA_LINK]) { NL_SET_ERR_MSG_MOD(extack, "link not specified"); return -EINVAL; } real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) { NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; } proto = nla_get_be16_default(data[IFLA_VLAN_PROTOCOL], htons(ETH_P_8021Q)); vlan->vlan_proto = proto; vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); vlan->real_dev = real_dev; dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlan->flags = VLAN_FLAG_REORDER_HDR; err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id, extack); if (err < 0) return err; max_mtu = netif_reduces_vlan_mtu(real_dev) ? real_dev->mtu - VLAN_HLEN : real_dev->mtu; if (!tb[IFLA_MTU]) dev->mtu = max_mtu; else if (dev->mtu > max_mtu) return -EINVAL; /* Note: If this initial vlan_changelink() fails, we need * to call vlan_dev_free_egress_priority() to free memory. */ err = vlan_changelink(dev, tb, data, extack); if (!err) err = register_vlan_dev(dev, extack); if (err) vlan_dev_free_egress_priority(dev); return err; } static inline size_t vlan_qos_map_size(unsigned int n) { if (n == 0) return 0; /* IFLA_VLAN_{EGRESS,INGRESS}_QOS + n * IFLA_VLAN_QOS_MAPPING */ return nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(struct ifla_vlan_qos_mapping)) * n; } static size_t vlan_get_size(const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return nla_total_size(2) + /* IFLA_VLAN_PROTOCOL */ nla_total_size(2) + /* IFLA_VLAN_ID */ nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */ vlan_qos_map_size(vlan->nr_ingress_mappings) + vlan_qos_map_size(vlan->nr_egress_mappings); } static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_priority_tci_mapping *pm; struct ifla_vlan_flags f; struct ifla_vlan_qos_mapping m; struct nlattr *nest; unsigned int i; if (nla_put_be16(skb, IFLA_VLAN_PROTOCOL, vlan->vlan_proto) || nla_put_u16(skb, IFLA_VLAN_ID, vlan->vlan_id)) goto nla_put_failure; if (vlan->flags) { f.flags = vlan->flags; f.mask = ~0; if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f)) goto nla_put_failure; } if (vlan->nr_ingress_mappings) { nest = nla_nest_start_noflag(skb, IFLA_VLAN_INGRESS_QOS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) { if (!vlan->ingress_priority_map[i]) continue; m.from = i; m.to = vlan->ingress_priority_map[i]; if (nla_put(skb, IFLA_VLAN_QOS_MAPPING, sizeof(m), &m)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (vlan->nr_egress_mappings) { nest = nla_nest_start_noflag(skb, IFLA_VLAN_EGRESS_QOS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { for (pm = vlan->egress_priority_map[i]; pm; pm = pm->next) { if (!pm->vlan_qos) continue; m.from = pm->priority; m.to = (pm->vlan_qos >> 13) & 0x7; if (nla_put(skb, IFLA_VLAN_QOS_MAPPING, sizeof(m), &m)) goto nla_put_failure; } } nla_nest_end(skb, nest); } return 0; nla_put_failure: return -EMSGSIZE; } static struct net *vlan_get_link_net(const struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return dev_net(real_dev); } struct rtnl_link_ops vlan_link_ops __read_mostly = { .kind = "vlan", .maxtype = IFLA_VLAN_MAX, .policy = vlan_policy, .priv_size = sizeof(struct vlan_dev_priv), .setup = vlan_setup, .validate = vlan_validate, .newlink = vlan_newlink, .changelink = vlan_changelink, .dellink = unregister_vlan_dev, .get_size = vlan_get_size, .fill_info = vlan_fill_info, .get_link_net = vlan_get_link_net, }; int __init vlan_netlink_init(void) { return rtnl_link_register(&vlan_link_ops); } void __exit vlan_netlink_fini(void) { rtnl_link_unregister(&vlan_link_ops); } MODULE_ALIAS_RTNL_LINK("vlan");
6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 184 61 175 39 178 187 184 31 39 39 31 544 264 306 65 6 37 64 188 209 83 41 105 125 126 88 175 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 /* * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/in.h> #include <linux/in6.h> /* For in6_dev_get/in6_dev_put */ #include <net/addrconf.h> #include <net/bonding.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> static struct workqueue_struct *gid_cache_wq; enum gid_op_type { GID_DEL = 0, GID_ADD }; struct update_gid_event_work { struct work_struct work; union ib_gid gid; struct ib_gid_attr gid_attr; enum gid_op_type gid_op; }; #define ROCE_NETDEV_CALLBACK_SZ 3 struct netdev_event_work_cmd { roce_netdev_callback cb; roce_netdev_filter filter; struct net_device *ndev; struct net_device *filter_ndev; }; struct netdev_event_work { struct work_struct work; struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; static const struct { bool (*is_supported)(const struct ib_device *device, u32 port_num); enum ib_gid_type gid_type; } PORT_CAP_TO_GID_TYPE[] = { {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, }; #define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port) { int i; unsigned int ret_flags = 0; if (!rdma_protocol_roce(ib_dev, port)) return 1UL << IB_GID_TYPE_IB; for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; return ret_flags; } EXPORT_SYMBOL(roce_gid_type_mask_support); static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { int i; unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); for (i = 0; i < IB_GID_TYPE_SIZE; i++) { if ((1UL << i) & gid_type_mask) { gid_attr->gid_type = i; switch (gid_op) { case GID_ADD: ib_cache_gid_add(ib_dev, port, gid, gid_attr); break; case GID_DEL: ib_cache_gid_del(ib_dev, port, gid, gid_attr); break; } } } } enum bonding_slave_state { BONDING_SLAVE_STATE_ACTIVE = 1UL << 0, BONDING_SLAVE_STATE_INACTIVE = 1UL << 1, /* No primary slave or the device isn't a slave in bonding */ BONDING_SLAVE_STATE_NA = 1UL << 2, }; static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev, struct net_device *upper) { if (upper && netif_is_bond_master(upper)) { struct net_device *pdev = bond_option_active_slave_get_rcu(netdev_priv(upper)); if (pdev) return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE : BONDING_SLAVE_STATE_INACTIVE; } return BONDING_SLAVE_STATE_NA; } #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static bool is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); if (!real_dev) real_dev = cookie; res = ((rdma_is_upper_dev_rcu(rdma_ndev, cookie) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) || real_dev == rdma_ndev); rcu_read_unlock(); return res; } static bool is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; bool res; if (!rdma_ndev) return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) == BONDING_SLAVE_STATE_INACTIVE; rcu_read_unlock(); return res; } /** * is_ndev_for_default_gid_filter - Check if a given netdevice * can be considered for default GIDs or not. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: rdma netdevice pointer * @cookie: Netdevice to consider to form a default GID * * is_ndev_for_default_gid_filter() returns true if a given netdevice can be * considered for deriving default RoCE GID, returns false otherwise. */ static bool is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool res; if (!rdma_ndev) return false; rcu_read_lock(); /* * When rdma netdevice is used in bonding, bonding master netdevice * should be considered for default GIDs. Therefore, ignore slave rdma * netdevices when bonding is considered. * Additionally when event(cookie) netdevice is bond master device, * make sure that it the upper netdevice of rdma netdevice. */ res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); rcu_read_unlock(); return res; } static bool pass_all_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { return true; } static bool upper_device_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { bool res; if (!rdma_ndev) return false; if (rdma_ndev == cookie) return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); rcu_read_unlock(); return res; } /** * is_upper_ndev_bond_master_filter - Check if a given netdevice * is bond master device of netdevice of the RDMA device of port. * @ib_dev: IB device to check * @port: Port to consider for adding default GID * @rdma_ndev: Pointer to rdma netdevice * @cookie: Netdevice to consider to form a default GID * * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev * is bond master device and rdma_ndev is its lower netdevice. It might * not have been established as slave device yet. */ static bool is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; bool match = false; if (!rdma_ndev) return false; rcu_read_lock(); if (netif_is_bond_master(cookie_ndev) && rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) match = true; rcu_read_unlock(); return match; } static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u32 port, struct net_device *ndev, struct sockaddr *addr) { union ib_gid gid; struct ib_gid_attr gid_attr; rdma_ip2gid(addr, &gid); memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; if (!rdma_ndev) return; if (!real_dev) real_dev = event_ndev; rcu_read_lock(); if (((rdma_ndev != event_ndev && !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE)) { rcu_read_unlock(); return; } rcu_read_unlock(); gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void enum_netdev_ipv4_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { const struct in_ifaddr *ifa; struct in_device *in_dev; struct sin_list { struct list_head list; struct sockaddr_in ip; }; struct sin_list *sin_iter; struct sin_list *sin_temp; LIST_HEAD(sin_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; rcu_read_lock(); in_dev = __in_dev_get_rcu(ndev); if (!in_dev) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { struct sin_list *entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) continue; entry->ip.sin_family = AF_INET; entry->ip.sin_addr.s_addr = ifa->ifa_address; list_add_tail(&entry->list, &sin_list); } rcu_read_unlock(); list_for_each_entry_safe(sin_iter, sin_temp, &sin_list, list) { update_gid_ip(GID_ADD, ib_dev, port, ndev, (struct sockaddr *)&sin_iter->ip); list_del(&sin_iter->list); kfree(sin_iter); } } static void enum_netdev_ipv6_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { struct inet6_ifaddr *ifp; struct inet6_dev *in6_dev; struct sin6_list { struct list_head list; struct sockaddr_in6 sin6; }; struct sin6_list *sin6_iter; struct sin6_list *sin6_temp; struct ib_gid_attr gid_attr = {.ndev = ndev}; LIST_HEAD(sin6_list); if (ndev->reg_state >= NETREG_UNREGISTERING) return; in6_dev = in6_dev_get(ndev); if (!in6_dev) return; read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { struct sin6_list *entry = kzalloc_obj(*entry, GFP_ATOMIC); if (!entry) continue; entry->sin6.sin6_family = AF_INET6; entry->sin6.sin6_addr = ifp->addr; list_add_tail(&entry->list, &sin6_list); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) { union ib_gid gid; rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid); update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr); list_del(&sin6_iter->list); kfree(sin6_iter); } } static void _add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { enum_netdev_ipv4_ips(ib_dev, port, ndev); if (IS_ENABLED(CONFIG_IPV6)) enum_netdev_ipv6_ips(ib_dev, port, ndev); } static void add_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { _add_netdev_ips(ib_dev, port, cookie); } static void del_netdev_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } /** * del_default_gids - Delete default GIDs of the event/cookie netdevice * @ib_dev: RDMA device pointer * @port: Port of the RDMA device whose GID table to consider * @rdma_ndev: Unused rdma netdevice * @cookie: Pointer to event netdevice * * del_default_gids() deletes the default GIDs of the event/cookie netdevice. */ static void del_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *cookie_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } static void add_default_gids(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *event_ndev = cookie; unsigned long gid_type_mask; gid_type_mask = roce_gid_type_mask_support(ib_dev, port); ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net *net; struct net_device *ndev; /* Lock the rtnl to make sure the netdevs does not move under * our feet */ rtnl_lock(); down_read(&net_rwsem); for_each_net(net) for_each_netdev(net, ndev) { /* * Filter and add default GIDs of the primary netdevice * when not in bonding mode, or add default GIDs * of bond master device, when in bonding mode. */ if (is_ndev_for_default_gid_filter(ib_dev, port, rdma_ndev, ndev)) add_default_gids(ib_dev, port, rdma_ndev, ndev); if (is_eth_port_of_netdev_filter(ib_dev, port, rdma_ndev, ndev)) _add_netdev_ips(ib_dev, port, ndev); } up_read(&net_rwsem); rtnl_unlock(); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @ib_dev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } EXPORT_SYMBOL(rdma_roce_rescan_device); /** * rdma_roce_rescan_port - Rescan all of the network devices in the system * and add their gids if relevant to the port of the RoCE device. * * @ib_dev: IB device * @port: Port number */ void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port) { struct net_device *ndev = NULL; if (rdma_protocol_roce(ib_dev, port)) { ndev = ib_device_get_netdev(ib_dev, port); if (!ndev) return; enum_all_gids_of_dev_cb(ib_dev, port, ndev, ndev); dev_put(ndev); } } EXPORT_SYMBOL(rdma_roce_rescan_port); static void callback_for_addr_gid_device_scan(struct ib_device *device, u32 port, struct net_device *rdma_ndev, void *cookie) { struct update_gid_event_work *parsed = cookie; return update_gid(parsed->gid_op, device, port, &parsed->gid, &parsed->gid_attr); } struct upper_list { struct list_head list; struct net_device *upper; }; static int netdev_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct upper_list *entry = kmalloc_obj(*entry, GFP_ATOMIC); struct list_head *upper_list = (struct list_head *)priv->data; if (!entry) return 0; list_add_tail(&entry->list, upper_list); dev_hold(upper); entry->upper = upper; return 0; } static void handle_netdev_upper(struct ib_device *ib_dev, u32 port, void *cookie, void (*handle_netdev)(struct ib_device *ib_dev, u32 port, struct net_device *ndev)) { struct net_device *ndev = cookie; struct netdev_nested_priv priv; struct upper_list *upper_iter; struct upper_list *upper_temp; LIST_HEAD(upper_list); priv.data = &upper_list; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(ndev, netdev_upper_walk, &priv); rcu_read_unlock(); handle_netdev(ib_dev, port, ndev); list_for_each_entry_safe(upper_iter, upper_temp, &upper_list, list) { handle_netdev(ib_dev, port, upper_iter->upper); dev_put(upper_iter->upper); list_del(&upper_iter->list); kfree(upper_iter); } } void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev) { ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev); } EXPORT_SYMBOL(roce_del_all_netdev_gids); static void del_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, roce_del_all_netdev_gids); } static void add_netdev_upper_ips(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips); } static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port, struct net_device *rdma_ndev, void *cookie) { struct net_device *master_ndev; rcu_read_lock(); master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev); dev_hold(master_ndev); rcu_read_unlock(); if (master_ndev) { bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, master_ndev); dev_put(master_ndev); } } /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. */ static void netdevice_event_work_handler(struct work_struct *_work) { struct netdev_event_work *work = container_of(_work, struct netdev_event_work, work); unsigned int i; for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) { ib_enum_all_roce_netdevs(work->cmds[i].filter, work->cmds[i].filter_ndev, work->cmds[i].cb, work->cmds[i].ndev); dev_put(work->cmds[i].ndev); dev_put(work->cmds[i].filter_ndev); } kfree(work); } static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, struct net_device *ndev) { unsigned int i; struct netdev_event_work *ndev_work = kmalloc_obj(*ndev_work); if (!ndev_work) return NOTIFY_DONE; memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds)); for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) { if (!ndev_work->cmds[i].ndev) ndev_work->cmds[i].ndev = ndev; if (!ndev_work->cmds[i].filter_ndev) ndev_work->cmds[i].filter_ndev = ndev; dev_hold(ndev_work->cmds[i].ndev); dev_hold(ndev_work->cmds[i].filter_ndev); } INIT_WORK(&ndev_work->work, netdevice_event_work_handler); queue_work(gid_cache_wq, &ndev_work->work); return NOTIFY_DONE; } static const struct netdev_event_work_cmd add_cmd = { .cb = add_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd add_cmd_upper_ips = { .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev_filter }; static void ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd upper_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter }; cmds[0] = upper_ips_del_cmd; cmds[0].ndev = changeupper_info->upper_dev; cmds[1] = add_cmd; } static const struct netdev_event_work_cmd bonding_default_add_cmd = { .cb = add_default_gids, .filter = is_upper_ndev_bond_master_filter }; static void ndev_event_link(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { static const struct netdev_event_work_cmd bonding_default_del_cmd = { .cb = del_default_gids, .filter = is_upper_ndev_bond_master_filter }; /* * When a lower netdev is linked to its upper bonding * netdev, delete lower slave netdev's default GIDs. */ cmds[0] = bonding_default_del_cmd; cmds[0].ndev = event_ndev; cmds[0].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device default GIDs */ cmds[1] = bonding_default_add_cmd; cmds[1].ndev = changeupper_info->upper_dev; cmds[1].filter_ndev = changeupper_info->upper_dev; /* Now add bonding upper device IP based GIDs */ cmds[2] = add_cmd_upper_ips; cmds[2].ndev = changeupper_info->upper_dev; cmds[2].filter_ndev = changeupper_info->upper_dev; } static void netdevice_event_changeupper(struct net_device *event_ndev, struct netdev_notifier_changeupper_info *changeupper_info, struct netdev_event_work_cmd *cmds) { if (changeupper_info->linking) ndev_event_link(event_ndev, changeupper_info, cmds); else ndev_event_unlink(changeupper_info, cmds); } static const struct netdev_event_work_cmd add_default_gid_cmd = { .cb = add_default_gids, .filter = is_ndev_for_default_gid_filter, }; static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave_filter }; static const struct netdev_event_work_cmd netdev_del_cmd = { .cb = del_netdev_ips, .filter = is_eth_port_of_netdev_filter }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} }; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: if (ndev->reg_state < NETREG_UNREGISTERED) cmds[0] = del_cmd; else return NOTIFY_DONE; break; case NETDEV_CHANGEADDR: cmds[0] = netdev_del_cmd; if (ndev->reg_state == NETREG_REGISTERED) { cmds[1] = add_default_gid_cmd; cmds[2] = add_cmd; } break; case NETDEV_CHANGEUPPER: netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; /* Add default GIDs of the bond device */ cmds[1] = bonding_default_add_cmd; /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; default: return NOTIFY_DONE; } return netdevice_queue_work(cmds, ndev); } static void update_gid_event_work_handler(struct work_struct *_work) { struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); kfree(work); } static int addr_event(struct notifier_block *this, unsigned long event, struct sockaddr *sa, struct net_device *ndev) { struct update_gid_event_work *work; enum gid_op_type gid_op; if (ndev->type != ARPHRD_ETHER) return NOTIFY_DONE; switch (event) { case NETDEV_UP: gid_op = GID_ADD; break; case NETDEV_DOWN: gid_op = GID_DEL; break; default: return NOTIFY_DONE; } work = kmalloc_obj(*work, GFP_ATOMIC); if (!work) return NOTIFY_DONE; INIT_WORK(&work->work, update_gid_event_work_handler); rdma_ip2gid(sa, &work->gid); work->gid_op = gid_op; memset(&work->gid_attr, 0, sizeof(work->gid_attr)); dev_hold(ndev); work->gid_attr.ndev = ndev; queue_work(gid_cache_wq, &work->work); return NOTIFY_DONE; } static int inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in in; struct net_device *ndev; struct in_ifaddr *ifa = ptr; in.sin_family = AF_INET; in.sin_addr.s_addr = ifa->ifa_address; ndev = ifa->ifa_dev->dev; return addr_event(this, event, (struct sockaddr *)&in, ndev); } static int inet6addr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct sockaddr_in6 in6; struct net_device *ndev; struct inet6_ifaddr *ifa6 = ptr; in6.sin6_family = AF_INET6; in6.sin6_addr = ifa6->addr; ndev = ifa6->idev->dev; return addr_event(this, event, (struct sockaddr *)&in6, ndev); } static struct notifier_block nb_netdevice = { .notifier_call = netdevice_event }; static struct notifier_block nb_inetaddr = { .notifier_call = inetaddr_event }; static struct notifier_block nb_inet6addr = { .notifier_call = inet6addr_event }; int __init roce_gid_mgmt_init(void) { gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0); if (!gid_cache_wq) return -ENOMEM; register_inetaddr_notifier(&nb_inetaddr); if (IS_ENABLED(CONFIG_IPV6)) register_inet6addr_notifier(&nb_inet6addr); /* We relay on the netdevice notifier to enumerate all * existing devices in the system. Register to this notifier * last to make sure we will not miss any IP add/del * callbacks. */ register_netdevice_notifier(&nb_netdevice); return 0; } void __exit roce_gid_mgmt_cleanup(void) { if (IS_ENABLED(CONFIG_IPV6)) unregister_inet6addr_notifier(&nb_inet6addr); unregister_inetaddr_notifier(&nb_inetaddr); unregister_netdevice_notifier(&nb_netdevice); /* Ensure all gid deletion tasks complete before we go down, * to avoid any reference to free'd memory. By the time * ib-core is removed, all physical devices have been removed, * so no issue with remaining hardware contexts. */ destroy_workqueue(gid_cache_wq); }
540 540 138 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 // SPDX-License-Identifier: GPL-2.0-or-later /* * vrf.c: device driver to encapsulate a VRF space * * Copyright (c) 2015 Cumulus Networks. All rights reserved. * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> * * Based on dummy, team and ipvlan drivers */ #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ip.h> #include <linux/init.h> #include <linux/moduleparam.h> #include <linux/netfilter.h> #include <linux/rtnetlink.h> #include <net/rtnetlink.h> #include <linux/u64_stats_sync.h> #include <linux/hashtable.h> #include <linux/spinlock_types.h> #include <linux/inetdevice.h> #include <net/arp.h> #include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #include <net/fib_rules.h> #include <net/netdev_lock.h> #include <net/sch_generic.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack.h> #define DRV_NAME "vrf" #define DRV_VERSION "1.1" #define FIB_RULE_PREF 1000 /* default preference for FIB rules */ #define HT_MAP_BITS 4 #define HASH_INITVAL ((u32)0xcafef00d) struct vrf_map { DECLARE_HASHTABLE(ht, HT_MAP_BITS); spinlock_t vmap_lock; /* shared_tables: * count how many distinct tables do not comply with the strict mode * requirement. * shared_tables value must be 0 in order to enable the strict mode. * * example of the evolution of shared_tables: * | time * add vrf0 --> table 100 shared_tables = 0 | t0 * add vrf1 --> table 101 shared_tables = 0 | t1 * add vrf2 --> table 100 shared_tables = 1 | t2 * add vrf3 --> table 100 shared_tables = 1 | t3 * add vrf4 --> table 101 shared_tables = 2 v t4 * * shared_tables is a "step function" (or "staircase function") * and it is increased by one when the second vrf is associated to a * table. * * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1. * * at t3, another dev (vrf3) is bound to the same table 100 but the * value of shared_tables is still 1. * This means that no matter how many new vrfs will register on the * table 100, the shared_tables will not increase (considering only * table 100). * * at t4, vrf4 is bound to table 101, and shared_tables = 2. * * Looking at the value of shared_tables we can immediately know if * the strict_mode can or cannot be enforced. Indeed, strict_mode * can be enforced iff shared_tables = 0. * * Conversely, shared_tables is decreased when a vrf is de-associated * from a table with exactly two associated vrfs. */ u32 shared_tables; bool strict_mode; }; struct vrf_map_elem { struct hlist_node hnode; struct list_head vrf_list; /* VRFs registered to this table */ u32 table_id; int users; int ifindex; }; static unsigned int vrf_net_id; /* per netns vrf data */ struct netns_vrf { /* protected by rtnl lock */ bool add_fib_rules; struct vrf_map vmap; struct ctl_table_header *ctl_hdr; }; struct net_vrf { struct rtable __rcu *rth; struct rt6_info __rcu *rt6; #if IS_ENABLED(CONFIG_IPV6) struct fib6_table *fib6_table; #endif u32 tb_id; struct list_head me_list; /* entry in vrf_map_elem */ int ifindex; }; static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) { vrf_dev->stats.tx_errors++; kfree_skb(skb); } static struct vrf_map *netns_vrf_map(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); return &nn_vrf->vmap; } static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev) { return netns_vrf_map(dev_net(dev)); } static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me) { struct list_head *me_head = &me->vrf_list; struct net_vrf *vrf; if (list_empty(me_head)) return -ENODEV; vrf = list_first_entry(me_head, struct net_vrf, me_list); return vrf->ifindex; } static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags) { struct vrf_map_elem *me; me = kmalloc_obj(*me, flags); if (!me) return NULL; return me; } static void vrf_map_elem_free(struct vrf_map_elem *me) { kfree(me); } static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id, int ifindex, int users) { me->table_id = table_id; me->ifindex = ifindex; me->users = users; INIT_LIST_HEAD(&me->vrf_list); } static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap, u32 table_id) { struct vrf_map_elem *me; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_for_each_possible(vmap->ht, me, hnode, key) { if (me->table_id == table_id) return me; } return NULL; } static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me) { u32 table_id = me->table_id; u32 key; key = jhash_1word(table_id, HASH_INITVAL); hash_add(vmap->ht, &me->hnode, key); } static void vrf_map_del_elem(struct vrf_map_elem *me) { hash_del(&me->hnode); } static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock) { spin_lock(&vmap->vmap_lock); } static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock) { spin_unlock(&vmap->vmap_lock); } /* called with rtnl lock held */ static int vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); struct vrf_map_elem *new_me, *me; u32 table_id = vrf->tb_id; bool free_new_me = false; int users; int res; /* we pre-allocate elements used in the spin-locked section (so that we * keep the spinlock as short as possible). */ new_me = vrf_map_elem_alloc(GFP_KERNEL); if (!new_me) return -ENOMEM; vrf_map_elem_init(new_me, table_id, dev->ifindex, 0); vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) { me = new_me; vrf_map_add_elem(vmap, me); goto link_vrf; } /* we already have an entry in the vrf_map, so it means there is (at * least) a vrf registered on the specific table. */ free_new_me = true; if (vmap->strict_mode) { /* vrfs cannot share the same table */ NL_SET_ERR_MSG(extack, "Table is used by another VRF"); res = -EBUSY; goto unlock; } link_vrf: users = ++me->users; if (users == 2) ++vmap->shared_tables; list_add(&vrf->me_list, &me->vrf_list); res = 0; unlock: vrf_map_unlock(vmap); /* clean-up, if needed */ if (free_new_me) vrf_map_elem_free(new_me); return res; } /* called with rtnl lock held */ static void vrf_map_unregister_dev(struct net_device *dev) { struct vrf_map *vmap = netns_vrf_map_by_dev(dev); struct net_vrf *vrf = netdev_priv(dev); u32 table_id = vrf->tb_id; struct vrf_map_elem *me; int users; vrf_map_lock(vmap); me = vrf_map_lookup_elem(vmap, table_id); if (!me) goto unlock; list_del(&vrf->me_list); users = --me->users; if (users == 1) { --vmap->shared_tables; } else if (users == 0) { vrf_map_del_elem(me); /* no one will refer to this element anymore */ vrf_map_elem_free(me); } unlock: vrf_map_unlock(vmap); } /* return the vrf device index associated with the table_id */ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) { struct vrf_map *vmap = netns_vrf_map(net); struct vrf_map_elem *me; int ifindex; vrf_map_lock(vmap); if (!vmap->strict_mode) { ifindex = -EPERM; goto unlock; } me = vrf_map_lookup_elem(vmap, table_id); if (!me) { ifindex = -ENODEV; goto unlock; } ifindex = vrf_map_elem_get_vrf_ifindex(me); unlock: vrf_map_unlock(vmap); return ifindex; } /* by default VRF devices do not have a qdisc and are expected * to be created with only a single queue. */ static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); return qdisc_txq_has_no_queue(txq); } /* Local traffic destined to local address. Reinsert the packet to rx * path, similar to loopback handling. */ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst) { unsigned int len = skb->len; skb_orphan(skb); skb_dst_set(skb, dst); /* set pkt_type to avoid skb hitting packet taps twice - * once on Tx and again in Rx processing */ skb->pkt_type = PACKET_LOOPBACK; skb->protocol = eth_type_trans(skb, dev); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_dstats_rx_add(dev, len); else dev_dstats_rx_dropped(dev); return NETDEV_TX_OK; } static void vrf_nf_set_untracked(struct sk_buff *skb) { if (skb_get_nfct(skb) == 0) nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } static void vrf_nf_reset_ct(struct sk_buff *skb) { if (skb_get_nfct(skb) == IP_CT_UNTRACKED) nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) static int vrf_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) goto err; iph = ipv6_hdr(skb); memset(&fl6, 0, sizeof(fl6)); /* needed to match OIF rule */ fl6.flowi6_l3mdev = dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = iph->daddr; fl6.saddr = iph->saddr; fl6.flowlabel = ip6_flowinfo(iph); fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = iph->nexthdr; dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL); if (IS_ERR(dst) || dst == dst_null) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (dst->dev == dev) return vrf_local_xmit(skb, dev, dst); skb_dst_set(skb, dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ret = vrf_ip6_local_out(net, skb->sk, skb); if (unlikely(net_xmit_eval(ret))) dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; return ret; err: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #else static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { vrf_tx_error(dev, skb); return NET_XMIT_DROP; } #endif /* based on ip_local_out; can't use it b/c the dst is switched pointing to us */ static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; vrf_nf_reset_ct(skb); err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { struct iphdr *ip4h; int ret = NET_XMIT_DROP; struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) goto err; ip4h = ip_hdr(skb); memset(&fl4, 0, sizeof(fl4)); /* needed to match OIF rule */ fl4.flowi4_l3mdev = vrf_dev->ifindex; fl4.flowi4_iif = LOOPBACK_IFINDEX; fl4.flowi4_dscp = ip4h_dscp(ip4h); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = ip4h->protocol; fl4.daddr = ip4h->daddr; fl4.saddr = ip4h->saddr; rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; skb_dst_drop(skb); /* if dst.dev is the VRF device again this is locally originated traffic * destined to a local address. Short circuit to Rx path. */ if (rt->dst.dev == vrf_dev) return vrf_local_xmit(skb, vrf_dev, &rt->dst); skb_dst_set(skb, &rt->dst); /* strip the ethernet header added for pass through VRF device */ __skb_pull(skb, skb_network_offset(skb)); if (!ip4h->saddr) { ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, RT_SCOPE_LINK); } memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(net_xmit_eval(ret))) vrf_dev->stats.tx_errors++; else ret = NET_XMIT_SUCCESS; out: return ret; err: vrf_tx_error(vrf_dev, skb); goto out; } static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) { switch (skb->protocol) { case htons(ETH_P_IP): return vrf_process_v4_outbound(skb, dev); case htons(ETH_P_IPV6): return vrf_process_v6_outbound(skb, dev); default: vrf_tx_error(dev, skb); return NET_XMIT_DROP; } } static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned int len = skb->len; netdev_tx_t ret; ret = is_ip_tx_frame(skb, dev); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_dstats_tx_add(dev, len); else dev_dstats_tx_dropped(dev); return ret; } static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; if (!list_empty(&vrf_dev->ptype_all) && likely(skb_headroom(skb) >= ETH_HLEN)) { struct ethhdr *eth = skb_push(skb, ETH_HLEN); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth_zero_addr(eth->h_dest); eth->h_proto = skb->protocol; rcu_read_lock_bh(); dev_queue_xmit_nit(skb, vrf_dev); rcu_read_unlock_bh(); skb_pull(skb, ETH_HLEN); } vrf_nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) /* modelled after ip6_finish_output2 */ static int vrf_finish_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; const struct in6_addr *nexthop; struct neighbour *neigh; int ret; vrf_nf_reset_ct(skb); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); rcu_read_unlock(); return ret; } rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } /* modelled after ip6_output */ static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb_dst(skb)->dev, vrf_finish_output6, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rt6_info *rt6; rcu_read_lock(); rt6 = rcu_dereference(vrf->rt6); if (likely(rt6)) { dst = &rt6->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output6_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip6_local_out(net, sk, skb); } static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IPV6); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output6_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip6_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert link scope packets */ if (rt6_need_strict(&ipv6_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return vrf_ip6_out_direct(vrf_dev, sk, skb); return vrf_ip6_out_redirect(vrf_dev, skb); } /* holding rtnl */ static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { struct rt6_info *rt6 = rtnl_dereference(vrf->rt6); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rt6, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rt6) { dst = &rt6->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rt6_create(struct net_device *dev) { int flags = DST_NOPOLICY | DST_NOXFRM; struct net_vrf *vrf = netdev_priv(dev); struct net *net = dev_net(dev); struct rt6_info *rt6; int rc = -ENOMEM; /* IPv6 can be CONFIG enabled and then disabled runtime */ if (!ipv6_mod_enabled()) return 0; vrf->fib6_table = fib6_new_table(net, vrf->tb_id); if (!vrf->fib6_table) goto out; /* create a dst for routing packets out a VRF device */ rt6 = ip6_dst_alloc(net, dev, flags); if (!rt6) goto out; rt6->dst.output = vrf_output6; rcu_assign_pointer(vrf->rt6, rt6); rc = 0; out: return rc; } #else static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { return skb; } static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf) { } static int vrf_rt6_create(struct net_device *dev) { return 0; } #endif /* modelled after ip_finish_output2 */ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); struct net_device *dev = dst->dev; unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; vrf_nf_reset_ct(skb); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); if (!skb) { dev->stats.tx_errors++; return -ENOMEM; } } rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { int ret; sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); rcu_read_unlock(); return ret; } rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb_dst(skb)->dev; IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); skb->dev = dev; skb->protocol = htons(ETH_P_IP); return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, dev, vrf_finish_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } /* set dst on skb to send packet to us via dev_xmit path. Allows * packet to go through device based features such as qdisc, netfilter * hooks and packet sockets with skb->dev set to vrf device. */ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_vrf *vrf = netdev_priv(vrf_dev); struct dst_entry *dst = NULL; struct rtable *rth; rcu_read_lock(); rth = rcu_dereference(vrf->rth); if (likely(rth)) { dst = &rth->dst; dst_hold(dst); } rcu_read_unlock(); if (unlikely(!dst)) { vrf_tx_error(vrf_dev, skb); return NULL; } skb_dst_drop(skb); skb_dst_set(skb, dst); return skb; } static int vrf_output_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { vrf_finish_direct(skb); return vrf_ip_local_out(net, sk, skb); } static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { int err = 1; skb->protocol = htons(ETH_P_IP); if (!(IPCB(skb)->flags & IPSKB_REROUTED)) err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, NULL, skb->dev, vrf_output_direct_finish); if (likely(err == 1)) vrf_finish_direct(skb); return err; } static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) err = vrf_ip_local_out(net, sk, skb); return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(vrf_dev); int err; skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); if (likely(err == 1)) return skb; return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb) { /* don't divert multicast or local broadcast */ if (ipv4_is_multicast(ip_hdr(skb)->daddr) || ipv4_is_lbcast(ip_hdr(skb)->daddr)) return skb; vrf_nf_set_untracked(skb); if (qdisc_tx_is_default(vrf_dev) || IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return vrf_ip_out_direct(vrf_dev, sk, skb); return vrf_ip_out_redirect(vrf_dev, skb); } /* called with rcu lock held */ static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev, struct sock *sk, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_out(vrf_dev, sk, skb); case AF_INET6: return vrf_ip6_out(vrf_dev, sk, skb); } return skb; } /* holding rtnl */ static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf) { struct rtable *rth = rtnl_dereference(vrf->rth); struct net *net = dev_net(dev); struct dst_entry *dst; RCU_INIT_POINTER(vrf->rth, NULL); synchronize_rcu(); /* move dev in dst's to loopback so this VRF device can be deleted * - based on dst_ifdown */ if (rth) { dst = &rth->dst; netdev_ref_replace(dst->dev, net->loopback_dev, &dst->dev_tracker, GFP_KERNEL); dst->dev = net->loopback_dev; dst_release(dst); } } static int vrf_rtable_create(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); struct rtable *rth; if (!fib_new_table(dev_net(dev), vrf->tb_id)) return -ENOMEM; /* create a dst for routing packets out through a VRF device */ rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1); if (!rth) return -ENOMEM; rth->dst.output = vrf_output; rcu_assign_pointer(vrf->rth, rth); return 0; } /**************************** device handling ********************/ /* cycle interface to flush neighbor cache and move routes across tables */ static void cycle_netdev(struct net_device *dev, struct netlink_ext_ack *extack) { unsigned int flags = dev->flags; int ret; if (!netif_running(dev)) return; ret = dev_change_flags(dev, flags & ~IFF_UP, extack); if (ret >= 0) ret = dev_change_flags(dev, flags, extack); if (ret < 0) { netdev_err(dev, "Failed to cycle device %s; route tables might be wrong!\n", dev->name); } } static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { int ret; /* do not allow loopback device to be enslaved to a VRF. * The vrf device acts as the loopback for the vrf. */ if (port_dev == dev_net(dev)->loopback_dev) { NL_SET_ERR_MSG(extack, "Can not enslave loopback device to a VRF"); return -EOPNOTSUPP; } port_dev->priv_flags |= IFF_L3MDEV_SLAVE; ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack); if (ret < 0) goto err; cycle_netdev(port_dev, extack); return 0; err: port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; return ret; } static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev, struct netlink_ext_ack *extack) { if (netif_is_l3_master(port_dev)) { NL_SET_ERR_MSG(extack, "Can not enslave an L3 master device to a VRF"); return -EINVAL; } if (netif_is_l3_slave(port_dev)) return -EINVAL; return do_vrf_add_slave(dev, port_dev, extack); } /* inverse of do_vrf_add_slave */ static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { netdev_upper_dev_unlink(port_dev, dev); port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; cycle_netdev(port_dev, NULL); return 0; } static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) { return do_vrf_del_slave(dev, port_dev); } static void vrf_dev_uninit(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); vrf_rtable_release(dev, vrf); vrf_rt6_release(dev, vrf); } static int vrf_dev_init(struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); /* create the default dst which points back to us */ if (vrf_rtable_create(dev) != 0) goto out_nomem; if (vrf_rt6_create(dev) != 0) goto out_rth; dev->flags = IFF_MASTER | IFF_NOARP; /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; netdev_lockdep_set_classes(dev); return 0; out_rth: vrf_rtable_release(dev, vrf); out_nomem: return -ENOMEM; } static const struct net_device_ops vrf_netdev_ops = { .ndo_init = vrf_dev_init, .ndo_uninit = vrf_dev_uninit, .ndo_start_xmit = vrf_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_add_slave = vrf_add_slave, .ndo_del_slave = vrf_del_slave, }; static u32 vrf_fib_table(const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return vrf->tb_id; } static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook, struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1) skb = NULL; /* kfree_skb(skb) handled by nf code */ return skb; } static int vrf_prepare_mac_header(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto) { struct ethhdr *eth; int err; /* in general, we do not know if there is enough space in the head of * the packet for hosting the mac header. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev)); if (unlikely(err)) /* no space in the skb head */ return -ENOBUFS; __skb_push(skb, ETH_HLEN); eth = (struct ethhdr *)skb->data; skb_reset_mac_header(skb); skb_reset_mac_len(skb); /* we set the ethernet destination and the source addresses to the * address of the VRF device. */ ether_addr_copy(eth->h_dest, vrf_dev->dev_addr); ether_addr_copy(eth->h_source, vrf_dev->dev_addr); eth->h_proto = htons(proto); /* the destination address of the Ethernet frame corresponds to the * address set on the VRF interface; therefore, the packet is intended * to be processed locally. */ skb->protocol = eth->h_proto; skb->pkt_type = PACKET_HOST; skb_postpush_rcsum(skb, skb->data, ETH_HLEN); skb_pull_inline(skb, ETH_HLEN); return 0; } /* prepare and add the mac header to the packet if it was not set previously. * In this way, packet sniffers such as tcpdump can parse the packet correctly. * If the mac header was already set, the original mac header is left * untouched and the function returns immediately. */ static int vrf_add_mac_header_if_unset(struct sk_buff *skb, struct net_device *vrf_dev, u16 proto, struct net_device *orig_dev) { if (skb_mac_header_was_set(skb) && dev_has_header(orig_dev)) return 0; return vrf_prepare_mac_header(skb, vrf_dev, proto); } #if IS_ENABLED(CONFIG_IPV6) /* neighbor handling is done with actual device; do not want * to flip skb->dev for those ndisc packets. This really fails * for multiple next protocols (e.g., NEXTHDR_HOP). But it is * a start. */ static bool ipv6_ndisc_frame(const struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); bool rc = false; if (iph->nexthdr == NEXTHDR_ICMP) { const struct icmp6hdr *icmph; struct icmp6hdr _icmph; icmph = skb_header_pointer(skb, sizeof(*iph), sizeof(_icmph), &_icmph); if (!icmph) goto out; switch (icmph->icmp6_type) { case NDISC_ROUTER_SOLICITATION: case NDISC_ROUTER_ADVERTISEMENT: case NDISC_NEIGHBOUR_SOLICITATION: case NDISC_NEIGHBOUR_ADVERTISEMENT: case NDISC_REDIRECT: rc = true; break; } } out: return rc; } static struct rt6_info *vrf_ip6_route_lookup(struct net *net, const struct net_device *dev, struct flowi6 *fl6, int ifindex, const struct sk_buff *skb, int flags) { struct net_vrf *vrf = netdev_priv(dev); return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags); } static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, int ifindex) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { .flowi6_iif = ifindex, .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; skb_dst_drop(skb); rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb, RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE); if (unlikely(!rt6)) return; if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst)) return; skb_dst_set(skb, &rt6->dst); } static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { int orig_iif = skb->skb_iif; bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For non-loopback strict packets, determine the dst using the original * ifindex. */ if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) { skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IP6CB(skb)->flags |= IP6SKB_L3SLAVE; if (skb->pkt_type == PACKET_LOOPBACK) skb->pkt_type = PACKET_HOST; else vrf_ip6_input_dst(skb, vrf_dev, orig_iif); goto out; } /* if packet is NDISC then keep the ingress interface */ if (!is_ndisc) { struct net_device *orig_dev = skb->dev; dev_dstats_rx_add(vrf_dev, skb->len); skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IPV6, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } IP6CB(skb)->flags |= IP6SKB_L3SLAVE; } if (need_strict) vrf_ip6_input_dst(skb, vrf_dev, orig_iif); skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } #else static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { return skb; } #endif static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, struct sk_buff *skb) { struct net_device *orig_dev = skb->dev; skb->dev = vrf_dev; skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; /* loopback traffic; do not push through packet taps again. * Reset pkt_type for upper layers to process skb */ if (skb->pkt_type == PACKET_LOOPBACK) { skb->pkt_type = PACKET_HOST; goto out; } dev_dstats_rx_add(vrf_dev, skb->len); if (!list_empty(&vrf_dev->ptype_all)) { int err; err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP, orig_dev); if (likely(!err)) { skb_push(skb, skb->mac_len); dev_queue_xmit_nit(skb, vrf_dev); skb_pull(skb, skb->mac_len); } } skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev); out: return skb; } /* called with rcu lock held */ static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev, struct sk_buff *skb, u16 proto) { switch (proto) { case AF_INET: return vrf_ip_rcv(vrf_dev, skb); case AF_INET6: return vrf_ip6_rcv(vrf_dev, skb); } return skb; } #if IS_ENABLED(CONFIG_IPV6) /* send to link-local or multicast address via interface enslaved to * VRF device. Force lookup to VRF table without changing flow struct * Note: Caller to this function must hold rcu_read_lock() and no refcnt * is taken on the dst by this function. */ static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev, struct flowi6 *fl6) { struct net *net = dev_net(dev); int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF; struct dst_entry *dst = NULL; struct rt6_info *rt; /* VRF device does not have a link-local address and * sending packets to link-local or mcast addresses over * a VRF device does not make sense */ if (fl6->flowi6_oif == dev->ifindex) { dst = &net->ipv6.ip6_null_entry->dst; return dst; } if (!ipv6_addr_any(&fl6->saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags); if (rt) dst = &rt->dst; return dst; } #endif static const struct l3mdev_ops vrf_l3mdev_ops = { .l3mdev_fib_table = vrf_fib_table, .l3mdev_l3_rcv = vrf_l3_rcv, .l3mdev_l3_out = vrf_l3_out, #if IS_ENABLED(CONFIG_IPV6) .l3mdev_link_scope_lookup = vrf_link_scope_lookup, #endif }; static void vrf_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, DRV_NAME, sizeof(info->driver)); strscpy(info->version, DRV_VERSION, sizeof(info->version)); } static const struct ethtool_ops vrf_ethtool_ops = { .get_drvinfo = vrf_get_drvinfo, }; static inline size_t vrf_fib_rule_nl_size(void) { size_t sz; sz = NLMSG_ALIGN(sizeof(struct fib_rule_hdr)); sz += nla_total_size(sizeof(u8)); /* FRA_L3MDEV */ sz += nla_total_size(sizeof(u32)); /* FRA_PRIORITY */ sz += nla_total_size(sizeof(u8)); /* FRA_PROTOCOL */ return sz; } static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it) { struct fib_rule_hdr *frh; struct nlmsghdr *nlh; struct sk_buff *skb; int err; if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) && !ipv6_mod_enabled()) return 0; skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0); if (!nlh) goto nla_put_failure; /* rule only needs to appear once */ nlh->nlmsg_flags |= NLM_F_EXCL; frh = nlmsg_data(nlh); memset(frh, 0, sizeof(*frh)); frh->family = family; frh->action = FR_ACT_TO_TBL; if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL)) goto nla_put_failure; if (nla_put_u8(skb, FRA_L3MDEV, 1)) goto nla_put_failure; if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF)) goto nla_put_failure; nlmsg_end(skb, nlh); if (add_it) { err = fib_newrule(dev_net(dev), skb, nlh, NULL, true); if (err == -EEXIST) err = 0; } else { err = fib_delrule(dev_net(dev), skb, nlh, NULL, true); if (err == -ENOENT) err = 0; } nlmsg_free(skb); return err; nla_put_failure: nlmsg_free(skb); return -EMSGSIZE; } static int vrf_add_fib_rules(const struct net_device *dev) { int err; err = vrf_fib_rule(dev, AF_INET, true); if (err < 0) goto out_err; err = vrf_fib_rule(dev, AF_INET6, true); if (err < 0) goto ipv6_err; #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true); if (err < 0) goto ipmr_err; #endif #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true); if (err < 0) goto ip6mr_err; #endif return 0; #if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES) ip6mr_err: vrf_fib_rule(dev, RTNL_FAMILY_IPMR, false); #endif #if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) ipmr_err: vrf_fib_rule(dev, AF_INET6, false); #endif ipv6_err: vrf_fib_rule(dev, AF_INET, false); out_err: netdev_err(dev, "Failed to add FIB rules.\n"); return err; } static void vrf_setup(struct net_device *dev) { ether_setup(dev); /* Initialize the device structure. */ dev->netdev_ops = &vrf_netdev_ops; dev->l3mdev_ops = &vrf_l3mdev_ops; dev->ethtool_ops = &vrf_ethtool_ops; dev->needs_free_netdev = true; /* Fill in device structure with ethernet-generic values. */ eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ dev->lltx = true; /* don't allow vrf devices to change network namespaces. */ dev->netns_immutable = true; /* does not make sense for a VLAN to be added to a vrf device */ dev->features |= NETIF_F_VLAN_CHALLENGED; /* enable offload features */ dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->hw_features = dev->features; dev->hw_enc_features = dev->features; /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_NO_RX_HANDLER; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; /* VRF devices do not care about MTU, but if the MTU is set * too low then the ipv4 and ipv6 protocols are disabled * which breaks networking. */ dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU; dev->mtu = dev->max_mtu; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG(extack, "Invalid hardware address"); return -EADDRNOTAVAIL; } } return 0; } static void vrf_dellink(struct net_device *dev, struct list_head *head) { struct net_device *port_dev; struct list_head *iter; netdev_for_each_lower_dev(dev, port_dev, iter) vrf_del_slave(dev, port_dev); vrf_map_unregister_dev(dev); unregister_netdevice_queue(dev, head); } static int vrf_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_vrf *vrf = netdev_priv(dev); struct nlattr **data = params->data; struct netns_vrf *nn_vrf; bool *add_fib_rules; struct net *net; int err; if (!data || !data[IFLA_VRF_TABLE]) { NL_SET_ERR_MSG(extack, "VRF table id is missing"); return -EINVAL; } vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); if (vrf->tb_id == RT_TABLE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE], "Invalid VRF table id"); return -EINVAL; } dev->priv_flags |= IFF_L3MDEV_MASTER; err = register_netdevice(dev); if (err) goto out; /* mapping between table_id and vrf; * note: such binding could not be done in the dev init function * because dev->ifindex id is not available yet. */ vrf->ifindex = dev->ifindex; err = vrf_map_register_dev(dev, extack); if (err) { unregister_netdevice(dev); goto out; } net = dev_net(dev); nn_vrf = net_generic(net, vrf_net_id); add_fib_rules = &nn_vrf->add_fib_rules; if (*add_fib_rules) { err = vrf_add_fib_rules(dev); if (err) { vrf_map_unregister_dev(dev); unregister_netdevice(dev); goto out; } *add_fib_rules = false; } out: return err; } static size_t vrf_nl_getsize(const struct net_device *dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ } static int vrf_fillinfo(struct sk_buff *skb, const struct net_device *dev) { struct net_vrf *vrf = netdev_priv(dev); return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); } static size_t vrf_get_slave_size(const struct net_device *bond_dev, const struct net_device *slave_dev) { return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */ } static int vrf_fill_slave_info(struct sk_buff *skb, const struct net_device *vrf_dev, const struct net_device *slave_dev) { struct net_vrf *vrf = netdev_priv(vrf_dev); if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id)) return -EMSGSIZE; return 0; } static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { [IFLA_VRF_TABLE] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vrf_link_ops __read_mostly = { .kind = DRV_NAME, .priv_size = sizeof(struct net_vrf), .get_size = vrf_nl_getsize, .policy = vrf_nl_policy, .validate = vrf_validate, .fill_info = vrf_fillinfo, .get_slave_size = vrf_get_slave_size, .fill_slave_info = vrf_fill_slave_info, .newlink = vrf_newlink, .dellink = vrf_dellink, .setup = vrf_setup, .maxtype = IFLA_VRF_MAX, }; static int vrf_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* only care about unregister events to drop slave references */ if (event == NETDEV_UNREGISTER) { struct net_device *vrf_dev; if (!netif_is_l3_slave(dev)) goto out; vrf_dev = netdev_master_upper_dev_get(dev); vrf_del_slave(vrf_dev, dev); } out: return NOTIFY_DONE; } static struct notifier_block vrf_notifier_block __read_mostly = { .notifier_call = vrf_device_event, }; static int vrf_map_init(struct vrf_map *vmap) { spin_lock_init(&vmap->vmap_lock); hash_init(vmap->ht); vmap->strict_mode = false; return 0; } #ifdef CONFIG_SYSCTL static bool vrf_strict_mode(struct vrf_map *vmap) { bool strict_mode; vrf_map_lock(vmap); strict_mode = vmap->strict_mode; vrf_map_unlock(vmap); return strict_mode; } static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode) { bool *cur_mode; int res = 0; vrf_map_lock(vmap); cur_mode = &vmap->strict_mode; if (*cur_mode == new_mode) goto unlock; if (*cur_mode) { /* disable strict mode */ *cur_mode = false; } else { if (vmap->shared_tables) { /* we cannot allow strict_mode because there are some * vrfs that share one or more tables. */ res = -EBUSY; goto unlock; } /* no tables are shared among vrfs, so we can go back * to 1:1 association between a vrf with its table. */ *cur_mode = true; } unlock: vrf_map_unlock(vmap); return res; } static int vrf_shared_table_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)table->extra1; struct vrf_map *vmap = netns_vrf_map(net); int proc_strict_mode = 0; struct ctl_table tmp = { .procname = table->procname, .data = &proc_strict_mode, .maxlen = sizeof(int), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; int ret; if (!write) proc_strict_mode = vrf_strict_mode(vmap); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) ret = vrf_strict_mode_change(vmap, (bool)proc_strict_mode); return ret; } static const struct ctl_table vrf_table[] = { { .procname = "strict_mode", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = vrf_shared_table_handler, /* set by the vrf_netns_init */ .extra1 = NULL, }, }; static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { struct ctl_table *table; table = kmemdup(vrf_table, sizeof(vrf_table), GFP_KERNEL); if (!table) return -ENOMEM; /* init the extra1 parameter with the reference to current netns */ table[0].extra1 = net; nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", table, ARRAY_SIZE(vrf_table)); if (!nn_vrf->ctl_hdr) { kfree(table); return -ENOMEM; } return 0; } static void vrf_netns_exit_sysctl(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); const struct ctl_table *table; table = nn_vrf->ctl_hdr->ctl_table_arg; unregister_net_sysctl_table(nn_vrf->ctl_hdr); kfree(table); } #else static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf) { return 0; } static void vrf_netns_exit_sysctl(struct net *net) { } #endif /* Initialize per network namespace state */ static int __net_init vrf_netns_init(struct net *net) { struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id); nn_vrf->add_fib_rules = true; vrf_map_init(&nn_vrf->vmap); return vrf_netns_init_sysctl(net, nn_vrf); } static void __net_exit vrf_netns_exit(struct net *net) { vrf_netns_exit_sysctl(net); } static struct pernet_operations vrf_net_ops __net_initdata = { .init = vrf_netns_init, .exit = vrf_netns_exit, .id = &vrf_net_id, .size = sizeof(struct netns_vrf), }; static int __init vrf_init_module(void) { int rc; register_netdevice_notifier(&vrf_notifier_block); rc = register_pernet_subsys(&vrf_net_ops); if (rc < 0) goto error; rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); if (rc < 0) goto unreg_pernet; rc = rtnl_link_register(&vrf_link_ops); if (rc < 0) goto table_lookup_unreg; return 0; table_lookup_unreg: l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF, vrf_ifindex_lookup_by_table_id); unreg_pernet: unregister_pernet_subsys(&vrf_net_ops); error: unregister_netdevice_notifier(&vrf_notifier_block); return rc; } module_init(vrf_init_module); MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK(DRV_NAME); MODULE_VERSION(DRV_VERSION);
5996 14 2372 2224 123 15 1408 13 119 21 12 2153 20 25 18 1 1 207 31 3 11 17 201 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit.h -- Auditing support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. (Rik) Faith <faith@redhat.com> */ #ifndef _LINUX_AUDIT_H_ #define _LINUX_AUDIT_H_ #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/audit_arch.h> #include <uapi/linux/audit.h> #include <uapi/linux/fanotify.h> #define AUDIT_STATUS_ALL (AUDIT_STATUS_ENABLED | \ AUDIT_STATUS_FAILURE | \ AUDIT_STATUS_PID | \ AUDIT_STATUS_RATE_LIMIT | \ AUDIT_STATUS_BACKLOG_LIMIT | \ AUDIT_STATUS_BACKLOG_WAIT_TIME | \ AUDIT_STATUS_LOST | \ AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) #define AUDIT_INO_UNSET ((u64)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { uid_t uid; pid_t pid; char ctx[]; }; struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; struct path; struct linux_binprm; struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; struct lsm_id; struct lsm_prop; struct audit_krule { u32 pflags; u32 flags; u32 listnr; u32 action; u32 mask[AUDIT_BITMASK_SIZE]; u32 buflen; /* for data alloc on list rules */ u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct audit_tree *tree; /* associated watched tree */ struct audit_fsnotify_mark *exe; struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ struct list_head list; /* for AUDIT_LIST* purposes only */ u64 prio; }; /* Flag to indicate legacy AUDIT_LOGINUID unset usage */ #define AUDIT_LOGINUID_LEGACY 0x1 struct audit_field { u32 type; union { u32 val; kuid_t uid; kgid_t gid; struct { char *lsm_str; void *lsm_rule; }; }; u32 op; }; enum audit_ntp_type { AUDIT_NTP_OFFSET, AUDIT_NTP_FREQ, AUDIT_NTP_STATUS, AUDIT_NTP_TAI, AUDIT_NTP_TICK, AUDIT_NTP_ADJUST, AUDIT_NTP_NVALS /* count */ }; #ifdef CONFIG_AUDITSYSCALL struct audit_ntp_val { long long oldval, newval; }; struct audit_ntp_data { struct audit_ntp_val vals[AUDIT_NTP_NVALS]; }; #else struct audit_ntp_data {}; #endif enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, AUDIT_XT_OP_UNREGISTER, AUDIT_NFT_OP_TABLE_REGISTER, AUDIT_NFT_OP_TABLE_UNREGISTER, AUDIT_NFT_OP_CHAIN_REGISTER, AUDIT_NFT_OP_CHAIN_UNREGISTER, AUDIT_NFT_OP_RULE_REGISTER, AUDIT_NFT_OP_RULE_UNREGISTER, AUDIT_NFT_OP_SET_REGISTER, AUDIT_NFT_OP_SET_UNREGISTER, AUDIT_NFT_OP_SETELEM_REGISTER, AUDIT_NFT_OP_SETELEM_UNREGISTER, AUDIT_NFT_OP_GEN_REGISTER, AUDIT_NFT_OP_OBJ_REGISTER, AUDIT_NFT_OP_OBJ_UNREGISTER, AUDIT_NFT_OP_OBJ_RESET, AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, AUDIT_NFT_OP_SETELEM_RESET, AUDIT_NFT_OP_RULE_RESET, AUDIT_NFT_OP_INVALID, }; extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ #define AUDIT_TYPE_NORMAL 1 /* a "normal" audit record */ #define AUDIT_TYPE_PARENT 2 /* a parent audit record */ #define AUDIT_TYPE_CHILD_DELETE 3 /* a child being deleted */ #define AUDIT_TYPE_CHILD_CREATE 4 /* a child being created */ /* maximized args number that audit_socketcall can process */ #define AUDITSC_ARGS 6 /* bit values for ->signal->audit_tty */ #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) /* bit values for audit_cfg_lsm */ #define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) #define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) struct filename; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ extern __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern bool audit_string_contains_control(const char *string, size_t len); extern void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len); extern void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n); extern void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n); extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); extern int audit_log_nf_skb(struct audit_buffer *ab, const struct sk_buff *skb, u8 nfproto); extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return tsk->loginuid; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return tsk->sessionid; } extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { } static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { return NULL; } static inline __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) { } static inline void audit_log_end(struct audit_buffer *ab) { } static inline void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { } static inline void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n) { } static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n) { } static inline void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { } static inline void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { } static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } static inline int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { return 0; } static inline int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { return 0; } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; } static inline void audit_log_task_info(struct audit_buffer *ab) { } static inline int audit_log_nf_skb(struct audit_buffer *ab, const struct sk_buff *skb, u8 nfproto) { return 0; } static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return INVALID_UID; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return AUDIT_SID_UNSET; } #define audit_enabled AUDIT_OFF static inline int audit_signal_info(int sig, struct task_struct *t) { return 0; } static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) { } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC #define audit_is_compat(arch) (!((arch) & __AUDIT_ARCH_64BIT)) #else #define audit_is_compat(arch) false #endif #define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ #define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ #define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ #ifdef CONFIG_AUDITSYSCALL #include <asm/syscall.h> /* for syscall_get_arch() */ /* These are defined in auditsc.c */ /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_uring_entry(u8 op); extern void __audit_uring_exit(int success, long code); extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern void __audit_getname(struct filename *name); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void audit_seccomp(unsigned long syscall, long signr, int code); extern void audit_seccomp_actions_logged(const char *names, const char *old_names, int res); extern void __audit_ptrace(struct task_struct *t); static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { task->audit_context = ctx; } static inline struct audit_context *audit_context(void) { return current->audit_context; } static inline bool audit_dummy_context(void) { void *p = audit_context(); return !p || *(int *)p; } static inline void audit_free(struct task_struct *task) { if (unlikely(task->audit_context)) __audit_free(task); } static inline void audit_uring_entry(u8 op) { /* * We intentionally check audit_context() before audit_enabled as most * Linux systems (as of ~2021) rely on systemd which forces audit to * be enabled regardless of the user's audit configuration. */ if (unlikely(audit_context() && audit_enabled)) __audit_uring_entry(op); } static inline void audit_uring_exit(int success, long code) { if (unlikely(audit_context())) __audit_uring_exit(success, code); } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { if (unlikely(audit_context())) __audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(audit_context())) { int success = is_syscall_success(pt_regs); long return_code = regs_return_value(pt_regs); __audit_syscall_exit(success, return_code); } } static inline void audit_getname(struct filename *name) { if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, aflags); } static inline void audit_file(struct file *file) { if (unlikely(!audit_dummy_context())) __audit_file(file); } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) __audit_inode_child(parent, dentry, type); } void audit_core_dumps(long signr); static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) __audit_ptrace(t); } /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(const char *name); extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { if (unlikely(!audit_dummy_context())) __audit_ipc_obj(ipcp); } static inline void audit_fd_pair(int fd1, int fd2) { if (unlikely(!audit_dummy_context())) __audit_fd_pair(fd1, fd2); } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { if (unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); } static inline int audit_socketcall(int nargs, unsigned long *args) { if (unlikely(!audit_dummy_context())) return __audit_socketcall(nargs, args); return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { unsigned long a[AUDITSC_ARGS]; int i; if (audit_dummy_context()) return 0; for (i = 0; i < nargs; i++) a[i] = (unsigned long)args[i]; return __audit_socketcall(nargs, a); } static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) return __audit_sockaddr(len, addr); return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) __audit_mq_open(oflag, mode, attr); } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { if (unlikely(!audit_dummy_context())) __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout); } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { if (unlikely(!audit_dummy_context())) __audit_mq_notify(mqdes, notification); } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { if (unlikely(!audit_dummy_context())) __audit_mq_getsetattr(mqdes, mqstat); } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) return __audit_log_bprm_fcaps(bprm, new, old); return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) __audit_log_capset(new, old); } static inline void audit_mmap_fd(int fd, int flags) { if (unlikely(!audit_dummy_context())) __audit_mmap_fd(fd, flags); } static inline void audit_openat2_how(struct open_how *how) { if (unlikely(!audit_dummy_context())) __audit_openat2_how(how); } static inline void audit_log_kern_module(const char *name) { if (!audit_dummy_context()) __audit_log_kern_module(name); } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { if (audit_enabled) __audit_fanotify(response, friar); } static inline void audit_tk_injoffset(struct timespec64 offset) { /* ignore no-op events */ if (offset.tv_sec == 0 && offset.tv_nsec == 0) return; if (!audit_dummy_context()) __audit_tk_injoffset(offset); } static inline void audit_ntp_init(struct audit_ntp_data *ad) { memset(ad, 0, sizeof(*ad)); } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].oldval = val; } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].newval = val; } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { if (!audit_dummy_context()) __audit_ntp_log(ad); } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { if (audit_enabled) __audit_log_nfcfg(name, af, nentries, op, gfp); } extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ static inline int audit_alloc(struct task_struct *task) { return 0; } static inline void audit_free(struct task_struct *task) { } static inline void audit_uring_entry(u8 op) { } static inline void audit_uring_exit(int success, long code) { } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { } static inline void audit_syscall_exit(void *pt_regs) { } static inline bool audit_dummy_context(void) { return true; } static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { } static inline struct audit_context *audit_context(void) { return NULL; } static inline void audit_getname(struct filename *name) { } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { } static inline void audit_file(struct file *file) { } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } static inline void audit_core_dumps(long signr) { } static inline void audit_seccomp(unsigned long syscall, long signr, int code) { } static inline void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } static inline void audit_bprm(struct linux_binprm *bprm) { } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { return 0; } static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) { return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { } static inline void audit_mmap_fd(int fd, int flags) { } static inline void audit_openat2_how(struct open_how *how) { } static inline void audit_log_kern_module(const char *name) { } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { } static inline void audit_tk_injoffset(struct timespec64 offset) { } static inline void audit_ntp_init(struct audit_ntp_data *ad) { } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { } static inline void audit_ptrace(struct task_struct *t) { } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ static inline bool audit_loginuid_set(struct task_struct *tsk) { return uid_valid(audit_get_loginuid(tsk)); } #endif
2009 2012 2008 199 2006 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/symlink.c - kernfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/gfp.h> #include <linux/namei.h> #include "kernfs-internal.h" /** * kernfs_create_link - create a symlink * @parent: directory to create the symlink in * @name: name of the symlink * @target: target node for the symlink to point to * * Return: the created node on success, ERR_PTR() value on error. * Ownership of the link matches ownership of the target. */ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { struct kernfs_node *kn; int error; kuid_t uid = GLOBAL_ROOT_UID; kgid_t gid = GLOBAL_ROOT_GID; if (target->iattr) { uid = target->iattr->ia_uid; gid = target->iattr->ia_gid; } kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); if (kernfs_ns_enabled(parent)) kn->ns = target->ns; kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ error = kernfs_add_one(kn); if (!error) return kn; kernfs_put(kn); return ERR_PTR(error); } static int kernfs_get_target_path(struct kernfs_node *parent, struct kernfs_node *target, char *path) { struct kernfs_node *base, *kn; char *s = path; int len = 0; /* go up to the root, stop at the base */ base = parent; while (kernfs_parent(base)) { kn = kernfs_parent(target); while (kernfs_parent(kn) && base != kn) kn = kernfs_parent(kn); if (base == kn) break; if ((s - path) + 3 >= PATH_MAX) return -ENAMETOOLONG; strcpy(s, "../"); s += 3; base = kernfs_parent(base); } /* determine end of target string for reverse fillup */ kn = target; while (kernfs_parent(kn) && kn != base) { len += strlen(kernfs_rcu_name(kn)) + 1; kn = kernfs_parent(kn); } /* check limits */ if (len < 2) return -EINVAL; len--; if ((s - path) + len >= PATH_MAX) return -ENAMETOOLONG; /* reverse fillup of target string from target to base */ kn = target; while (kernfs_parent(kn) && kn != base) { const char *name = kernfs_rcu_name(kn); int slen = strlen(name); len -= slen; memcpy(s + len, name, slen); if (len) s[--len] = '/'; kn = kernfs_parent(kn); } return 0; } static int kernfs_getlink(struct inode *inode, char *path) { struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent; struct kernfs_node *target = kn->symlink.target_kn; struct kernfs_root *root = kernfs_root(kn); int error; down_read(&root->kernfs_rwsem); parent = kernfs_parent(kn); error = kernfs_get_target_path(parent, target, path); up_read(&root->kernfs_rwsem); return error; } static const char *kernfs_iop_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *body; int error; if (!dentry) return ERR_PTR(-ECHILD); body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); } set_delayed_call(done, kfree_link, body); return body; } const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, };
2 2 2 2 1 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 /* * linux/fs/hfs/mdb.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains functions for reading/writing the MDB. */ #include <linux/cdrom.h> #include <linux/blkdev.h> #include <linux/nls.h> #include <linux/slab.h> #include "hfs_fs.h" #include "btree.h" /*================ File-local data types ================*/ /* * The HFS Master Directory Block (MDB). * * Also known as the Volume Information Block (VIB), this structure is * the HFS equivalent of a superblock. * * Reference: _Inside Macintosh: Files_ pages 2-59 through 2-62 * * modified for HFS Extended */ static int hfs_get_last_session(struct super_block *sb, sector_t *start, sector_t *size) { struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); /* default values */ *start = 0; *size = bdev_nr_sectors(sb->s_bdev); if (HFS_SB(sb)->session >= 0) { struct cdrom_tocentry te; if (!cdi) return -EINVAL; te.cdte_track = HFS_SB(sb)->session; te.cdte_format = CDROM_LBA; if (cdrom_read_tocentry(cdi, &te) || (te.cdte_ctrl & CDROM_DATA_TRACK) != 4) { pr_err("invalid session number or type of track\n"); return -EINVAL; } *start = (sector_t)te.cdte_addr.lba << 2; } else if (cdi) { struct cdrom_multisession ms_info; ms_info.addr_format = CDROM_LBA; if (cdrom_multisession(cdi, &ms_info) == 0 && ms_info.xa_flag) *start = (sector_t)ms_info.addr.lba << 2; } return 0; } bool is_hfs_cnid_counts_valid(struct super_block *sb) { struct hfs_sb_info *sbi = HFS_SB(sb); bool corrupted = false; if (unlikely(atomic64_read(&sbi->next_id) > U32_MAX)) { pr_warn("next CNID exceeds limit\n"); corrupted = true; } if (unlikely(atomic64_read(&sbi->file_count) > U32_MAX)) { pr_warn("file count exceeds limit\n"); corrupted = true; } if (unlikely(atomic64_read(&sbi->folder_count) > U32_MAX)) { pr_warn("folder count exceeds limit\n"); corrupted = true; } return !corrupted; } /* * hfs_mdb_get() * * Build the in-core MDB for a filesystem, including * the B-trees and the volume bitmap. */ int hfs_mdb_get(struct super_block *sb) { struct buffer_head *bh; struct hfs_mdb *mdb, *mdb2; unsigned int block; char *ptr; int off2, len, size, sect; sector_t part_start, part_size; loff_t off; __be16 attrib; /* set the device driver to 512-byte blocks */ size = sb_min_blocksize(sb, HFS_SECTOR_SIZE); if (!size) return -EINVAL; if (hfs_get_last_session(sb, &part_start, &part_size)) return -EINVAL; while (1) { /* See if this is an HFS filesystem */ bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); if (!bh) return -EIO; if (mdb->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) break; brelse(bh); /* check for a partition block * (should do this only for cdrom/loop though) */ if (hfs_part_find(sb, &part_start, &part_size)) return -EIO; } HFS_SB(sb)->alloc_blksz = size = be32_to_cpu(mdb->drAlBlkSiz); if (!size || (size & (HFS_SECTOR_SIZE - 1))) { pr_err("bad allocation block size %d\n", size); brelse(bh); return -EIO; } size = min(HFS_SB(sb)->alloc_blksz, (u32)PAGE_SIZE); /* size must be a multiple of 512 */ while (size & (size - 1)) size -= HFS_SECTOR_SIZE; sect = be16_to_cpu(mdb->drAlBlSt) + part_start; /* align block size to first sector */ while (sect & ((size - 1) >> HFS_SECTOR_SIZE_BITS)) size >>= 1; /* align block size to weird alloc size */ while (HFS_SB(sb)->alloc_blksz & (size - 1)) size >>= 1; brelse(bh); if (!sb_set_blocksize(sb, size)) { pr_err("unable to set blocksize to %u\n", size); return -EIO; } bh = sb_bread512(sb, part_start + HFS_MDB_BLK, mdb); if (!bh) return -EIO; if (mdb->drSigWord != cpu_to_be16(HFS_SUPER_MAGIC)) { brelse(bh); return -EIO; } HFS_SB(sb)->mdb_bh = bh; HFS_SB(sb)->mdb = mdb; /* These parameters are read from the MDB, and never written */ HFS_SB(sb)->part_start = part_start; HFS_SB(sb)->fs_ablocks = be16_to_cpu(mdb->drNmAlBlks); HFS_SB(sb)->fs_div = HFS_SB(sb)->alloc_blksz >> sb->s_blocksize_bits; HFS_SB(sb)->clumpablks = be32_to_cpu(mdb->drClpSiz) / HFS_SB(sb)->alloc_blksz; if (!HFS_SB(sb)->clumpablks) HFS_SB(sb)->clumpablks = 1; HFS_SB(sb)->fs_start = (be16_to_cpu(mdb->drAlBlSt) + part_start) >> (sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS); /* These parameters are read from and written to the MDB */ HFS_SB(sb)->free_ablocks = be16_to_cpu(mdb->drFreeBks); atomic64_set(&HFS_SB(sb)->next_id, be32_to_cpu(mdb->drNxtCNID)); HFS_SB(sb)->root_files = be16_to_cpu(mdb->drNmFls); HFS_SB(sb)->root_dirs = be16_to_cpu(mdb->drNmRtDirs); atomic64_set(&HFS_SB(sb)->file_count, be32_to_cpu(mdb->drFilCnt)); atomic64_set(&HFS_SB(sb)->folder_count, be32_to_cpu(mdb->drDirCnt)); if (!is_hfs_cnid_counts_valid(sb)) { pr_warn("filesystem possibly corrupted, running fsck.hfs is recommended. Mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } /* TRY to get the alternate (backup) MDB. */ sect = part_start + part_size - 2; bh = sb_bread512(sb, sect, mdb2); if (bh) { if (mdb2->drSigWord == cpu_to_be16(HFS_SUPER_MAGIC)) { HFS_SB(sb)->alt_mdb_bh = bh; HFS_SB(sb)->alt_mdb = mdb2; } else brelse(bh); } if (!HFS_SB(sb)->alt_mdb) { pr_warn("unable to locate alternate MDB\n"); pr_warn("continuing without an alternate MDB\n"); } HFS_SB(sb)->bitmap = kzalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) return -EIO; /* read in the bitmap */ block = be16_to_cpu(mdb->drVBMSt) + part_start; off = (loff_t)block << HFS_SECTOR_SIZE_BITS; size = (HFS_SB(sb)->fs_ablocks + 8) / 8; ptr = (u8 *)HFS_SB(sb)->bitmap; while (size) { bh = sb_bread(sb, off >> sb->s_blocksize_bits); if (!bh) { pr_err("unable to read volume bitmap\n"); return -EIO; } off2 = off & (sb->s_blocksize - 1); len = min((int)sb->s_blocksize - off2, size); memcpy(ptr, bh->b_data + off2, len); brelse(bh); ptr += len; off += len; size -= len; } HFS_SB(sb)->ext_tree = hfs_btree_open(sb, HFS_EXT_CNID, hfs_ext_keycmp); if (!HFS_SB(sb)->ext_tree) { pr_err("unable to open extent tree\n"); return -EIO; } HFS_SB(sb)->cat_tree = hfs_btree_open(sb, HFS_CAT_CNID, hfs_cat_keycmp); if (!HFS_SB(sb)->cat_tree) { pr_err("unable to open catalog tree\n"); return -EIO; } attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. Mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { pr_warn("filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= SB_RDONLY; } if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); mdb->drAtrb = attrib; be32_add_cpu(&mdb->drWrCnt, 1); mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); sync_dirty_buffer(HFS_SB(sb)->mdb_bh); } return 0; } /* * hfs_mdb_commit() * * Description: * This updates the MDB on disk. * It does not check, if the superblock has been modified, or * if the filesystem has been mounted read-only. It is mainly * called by hfs_sync_fs() and flush_mdb(). * Input Variable(s): * struct hfs_mdb *mdb: Pointer to the hfs MDB * int backup; * Output Variable(s): * NONE * Returns: * void * Preconditions: * 'mdb' points to a "valid" (struct hfs_mdb). * Postconditions: * The HFS MDB and on disk will be updated, by copying the possibly * modified fields from the in memory MDB (in native byte order) to * the disk block buffer. * If 'backup' is non-zero then the alternate MDB is also written * and the function doesn't return until it is actually on disk. */ void hfs_mdb_commit(struct super_block *sb) { struct hfs_mdb *mdb = HFS_SB(sb)->mdb; if (sb_rdonly(sb)) return; lock_buffer(HFS_SB(sb)->mdb_bh); if (test_and_clear_bit(HFS_FLG_MDB_DIRTY, &HFS_SB(sb)->flags)) { /* These parameters may have been modified, so write them back */ mdb->drLsMod = hfs_mtime(); mdb->drFreeBks = cpu_to_be16(HFS_SB(sb)->free_ablocks); mdb->drNxtCNID = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->next_id)); mdb->drNmFls = cpu_to_be16(HFS_SB(sb)->root_files); mdb->drNmRtDirs = cpu_to_be16(HFS_SB(sb)->root_dirs); mdb->drFilCnt = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->file_count)); mdb->drDirCnt = cpu_to_be32((u32)atomic64_read(&HFS_SB(sb)->folder_count)); /* write MDB to disk */ mark_buffer_dirty(HFS_SB(sb)->mdb_bh); } /* write the backup MDB, not returning until it is written. * we only do this when either the catalog or extents overflow * files grow. */ if (test_and_clear_bit(HFS_FLG_ALT_MDB_DIRTY, &HFS_SB(sb)->flags) && HFS_SB(sb)->alt_mdb) { hfs_inode_write_fork(HFS_SB(sb)->ext_tree->inode, mdb->drXTExtRec, &mdb->drXTFlSize, NULL); hfs_inode_write_fork(HFS_SB(sb)->cat_tree->inode, mdb->drCTExtRec, &mdb->drCTFlSize, NULL); lock_buffer(HFS_SB(sb)->alt_mdb_bh); memcpy(HFS_SB(sb)->alt_mdb, HFS_SB(sb)->mdb, HFS_SECTOR_SIZE); HFS_SB(sb)->alt_mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->alt_mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); unlock_buffer(HFS_SB(sb)->alt_mdb_bh); mark_buffer_dirty(HFS_SB(sb)->alt_mdb_bh); sync_dirty_buffer(HFS_SB(sb)->alt_mdb_bh); } if (test_and_clear_bit(HFS_FLG_BITMAP_DIRTY, &HFS_SB(sb)->flags)) { struct buffer_head *bh; sector_t block; char *ptr; int off, size, len; block = be16_to_cpu(HFS_SB(sb)->mdb->drVBMSt) + HFS_SB(sb)->part_start; off = (block << HFS_SECTOR_SIZE_BITS) & (sb->s_blocksize - 1); block >>= sb->s_blocksize_bits - HFS_SECTOR_SIZE_BITS; size = (HFS_SB(sb)->fs_ablocks + 7) / 8; ptr = (u8 *)HFS_SB(sb)->bitmap; while (size) { bh = sb_bread(sb, block); if (!bh) { pr_err("unable to read volume bitmap\n"); break; } len = min((int)sb->s_blocksize - off, size); lock_buffer(bh); memcpy(bh->b_data + off, ptr, len); unlock_buffer(bh); mark_buffer_dirty(bh); brelse(bh); block++; off = 0; ptr += len; size -= len; } } unlock_buffer(HFS_SB(sb)->mdb_bh); } void hfs_mdb_close(struct super_block *sb) { /* update volume attributes */ if (sb_rdonly(sb)) return; HFS_SB(sb)->mdb->drAtrb |= cpu_to_be16(HFS_SB_ATTRIB_UNMNT); HFS_SB(sb)->mdb->drAtrb &= cpu_to_be16(~HFS_SB_ATTRIB_INCNSTNT); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); } /* * hfs_mdb_put() * * Release the resources associated with the in-core MDB. */ void hfs_mdb_put(struct super_block *sb) { /* free the B-trees */ hfs_btree_close(HFS_SB(sb)->ext_tree); hfs_btree_close(HFS_SB(sb)->cat_tree); /* free the buffers holding the primary and alternate MDBs */ brelse(HFS_SB(sb)->mdb_bh); brelse(HFS_SB(sb)->alt_mdb_bh); unload_nls(HFS_SB(sb)->nls_io); unload_nls(HFS_SB(sb)->nls_disk); kfree(HFS_SB(sb)->bitmap); }
3 3 2 3 2 2 2 3 6 5 1 1 1 10 1 3 6 2 2 3 1 10 10 4 4 2 2 4 4 16 17 17 1 1 1 1 1 1 1 1 2 2 1 1 1 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing */ #include "multicast.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/icmpv6.h> #include <linux/if_bridge.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/addrconf.h> #include <net/genetlink.h> #include <net/if_inet6.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_mcast_mla_update(struct work_struct *work); /** * batadv_mcast_start_timer() - schedule the multicast periodic worker * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) { queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work, msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD)); } /** * batadv_mcast_get_bridge() - get the bridge on top of the meshif if it exists * @mesh_iface: netdev struct of the mesh interface * * If the given mesh interface has a bridge on top then the refcount * of the according net device is increased. * * Return: NULL if no such bridge exists. Otherwise the net device of the * bridge. */ static struct net_device *batadv_mcast_get_bridge(struct net_device *mesh_iface) { struct net_device *upper = mesh_iface; rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !netif_is_bridge_master(upper)); dev_hold(upper); rcu_read_unlock(); return upper; } /** * batadv_mcast_mla_rtr_flags_meshif_get_ipv4() - get mcast router flags from * node for IPv4 * @dev: the interface to check * * Checks the presence of an IPv4 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. */ static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv4(struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_MFORWARD(in_dev)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR4; } /** * batadv_mcast_mla_rtr_flags_meshif_get_ipv6() - get mcast router flags from * node for IPv6 * @dev: the interface to check * * Checks the presence of an IPv6 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. */ #if IS_ENABLED(CONFIG_IPV6_MROUTE) static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR6; } #else static inline u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev) { return BATADV_MCAST_WANT_NO_RTR6; } #endif /** * batadv_mcast_mla_rtr_flags_meshif_get() - get mcast router flags from node * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_meshif_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bridge ? bridge : bat_priv->mesh_iface; u8 flags = BATADV_NO_FLAGS; rcu_read_lock(); flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv4(dev); flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv6(dev); rcu_read_unlock(); return flags; } /** * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bat_priv->mesh_iface; u8 flags = BATADV_NO_FLAGS; if (!bridge) return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; if (!br_multicast_has_router_adjacent(dev, ETH_P_IP)) flags |= BATADV_MCAST_WANT_NO_RTR4; if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6)) flags |= BATADV_MCAST_WANT_NO_RTR6; return flags; } /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags * @bat_priv: the bat priv with all the mesh interface information * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node or behind its bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, struct net_device *bridge) { u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; flags &= batadv_mcast_mla_rtr_flags_meshif_get(bat_priv, bridge); flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); return flags; } /** * batadv_mcast_mla_forw_flags_get() - get multicast forwarding flags * @bat_priv: the bat priv with all the mesh interface information * * Checks if all active hard interfaces have an MTU larger or equal to 1280 * bytes (IPv6 minimum MTU). * * Return: BATADV_MCAST_HAVE_MC_PTYPE_CAPA if yes, BATADV_NO_FLAGS otherwise. */ static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) { const struct batadv_hard_iface *hard_iface; struct list_head *iter; rcu_read_lock(); netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) { if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) { rcu_read_unlock(); return BATADV_NO_FLAGS; } } rcu_read_unlock(); return BATADV_MCAST_HAVE_MC_PTYPE_CAPA; } /** * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the mesh interface information * * Return: A set of flags for the current/next TVLV, querier and * bridge state. */ static struct batadv_mcast_mla_flags batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) { struct net_device *dev = bat_priv->mesh_iface; struct batadv_mcast_querier_state *qr4, *qr6; struct batadv_mcast_mla_flags mla_flags; struct net_device *bridge; bridge = batadv_mcast_get_bridge(dev); memset(&mla_flags, 0, sizeof(mla_flags)); mla_flags.enabled = 1; mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, bridge); mla_flags.tvlv_flags |= batadv_mcast_mla_forw_flags_get(bat_priv); if (!bridge) return mla_flags; dev_put(bridge); mla_flags.bridged = 1; qr4 = &mla_flags.querier_ipv4; qr6 = &mla_flags.querier_ipv6; if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; /* 1) If no querier exists at all, then multicast listeners on * our local TT clients behind the bridge will keep silent. * 2) If the selected querier is on one of our local TT clients, * behind the bridge, then this querier might shadow multicast * listeners on our local TT clients, behind this bridge. * * In both cases, we will signalize other batman nodes that * we need all multicast traffic of the according protocol. */ if (!qr4->exists || qr4->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4; } if (!qr6->exists || qr6->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6; } return mla_flags; } /** * batadv_mcast_mla_is_duplicate() - check whether an address is in a list * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; hlist_for_each_entry(mcast_entry, mcast_list, list) if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) return true; return false; } /** * batadv_mcast_mla_meshif_get_ipv4() - get meshif IPv4 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv4 multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_meshif_get_ipv4(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct in_device *in_dev; u8 mcast_addr[ETH_ALEN]; struct ip_mc_list *pmc; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) return 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return 0; } for (pmc = rcu_dereference(in_dev->mc_list); pmc; pmc = rcu_dereference(pmc->next_rcu)) { if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(pmc->multiaddr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(pmc->multiaddr)) continue; ip_eth_mc_map(pmc->multiaddr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } /** * batadv_mcast_mla_meshif_get_ipv6() - get meshif IPv6 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv6 multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ #if IS_ENABLED(CONFIG_IPV6) static int batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct inet6_dev *in6_dev; u8 mcast_addr[ETH_ALEN]; struct ifmcaddr6 *pmc6; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) return 0; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) { rcu_read_unlock(); return 0; } for (pmc6 = rcu_dereference(in6_dev->mc_list); pmc6; pmc6 = rcu_dereference(pmc6->next)) { if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } #else static inline int batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { return 0; } #endif /** * batadv_mcast_mla_meshif_get() - get meshif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * If there is a bridge interface on top of dev, collect from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_meshif_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct net_device *bridge = batadv_mcast_get_bridge(dev); int ret4, ret6 = 0; if (bridge) dev = bridge; ret4 = batadv_mcast_mla_meshif_get_ipv4(dev, mcast_list, flags); if (ret4 < 0) goto out; ret6 = batadv_mcast_mla_meshif_get_ipv6(dev, mcast_list, flags); if (ret6 < 0) { ret4 = 0; goto out; } out: dev_put(bridge); return ret4 + ret6; } /** * batadv_mcast_mla_br_addr_cpy() - copy a bridge multicast address * @dst: destination to write to - a multicast MAC address * @src: source to read from - a multicast IP address * * Converts a given multicast IPv4/IPv6 address from a bridge * to its matching multicast MAC address and copies it into the given * destination buffer. * * Caller needs to make sure the destination buffer can hold * at least ETH_ALEN bytes. */ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) { if (src->proto == htons(ETH_P_IP)) ip_eth_mc_map(src->dst.ip4, dst); #if IS_ENABLED(CONFIG_IPV6) else if (src->proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&src->dst.ip6, dst); #endif else eth_zero_addr(dst); } /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via * a bridge on top of the given mesh interface, dev, in the given * mcast_list. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_bridge_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct br_ip_list *br_ip_entry, *tmp; u8 tvlv_flags = flags->tvlv_flags; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; int ret; /* we don't need to detect these devices/listeners, the IGMP/MLD * snooping code of the Linux bridge already does that for us */ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); if (ret < 0) goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { if (br_ip_entry->addr.proto == htons(ETH_P_IP)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; } #if IS_ENABLED(CONFIG_IPV6) if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; } #endif batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc_obj(*new, GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); } out: list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { list_del(&br_ip_entry->list); kfree(br_ip_entry); } return ret; } /** * batadv_mcast_mla_list_free() - free a list of multicast addresses * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. */ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements * @bat_priv: the bat priv with all the mesh interface information * @mcast_list: a list of addresses which should _not_ be removed * * Retracts the announcement of any multicast listener from the * translation table except the ones listed in the given mcast_list. * * If mcast_list is NULL then all are retracted. */ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && batadv_mcast_mla_is_duplicate(mcast_entry->addr, mcast_list)) continue; batadv_tt_local_remove(bat_priv, mcast_entry->addr, BATADV_NO_FLAGS, "mcast TT outdated", false); hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_add() - add multicast listener announcements * @bat_priv: the bat priv with all the mesh interface information * @mcast_list: a list of addresses which are going to get added * * Adds multicast listener announcements from the given mcast_list to the * translation table if they have not been added yet. */ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; if (!mcast_list) return; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { if (batadv_mcast_mla_is_duplicate(mcast_entry->addr, &bat_priv->mcast.mla_list)) continue; if (!batadv_tt_local_add(bat_priv->mesh_iface, mcast_entry->addr, BATADV_NO_FLAGS, BATADV_NULL_IFINDEX, BATADV_NO_MARK)) continue; hlist_del(&mcast_entry->list); hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list); } } /** * batadv_mcast_querier_log() - debug output regarding the querier status on * link * @bat_priv: the bat priv with all the mesh interface information * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") * @old_state: the previous querier state on our link * @new_state: the new querier state on our link * * Outputs debug messages to the logging facility with log level 'mcast' * regarding changes to the querier status on the link which are relevant * to our multicast optimizations. * * Usually this is about whether a querier appeared or vanished in * our mesh or whether the querier is in the suboptimal position of being * behind our local bridge segment: Snooping switches will directly * forward listener reports to the querier, therefore batman-adv and * the bridge will potentially not see these listeners - the querier is * potentially shadowing listeners from us then. * * This is only interesting for nodes with a bridge on top of their * mesh interface. */ static void batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, struct batadv_mcast_querier_state *old_state, struct batadv_mcast_querier_state *new_state) { if (!old_state->exists && new_state->exists) batadv_info(bat_priv->mesh_iface, "%s Querier appeared\n", str_proto); else if (old_state->exists && !new_state->exists) batadv_info(bat_priv->mesh_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists) batadv_info(bat_priv->mesh_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); if (new_state->exists) { if ((!old_state->shadowing && new_state->shadowing) || (!old_state->exists && new_state->shadowing)) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is behind our bridged segment: Might shadow listeners\n", str_proto); else if (old_state->shadowing && !new_state->shadowing) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is not behind our bridged segment\n", str_proto); } } /** * batadv_mcast_bridge_log() - debug output for topology changes in bridged * setups * @bat_priv: the bat priv with all the mesh interface information * @new_flags: flags indicating the new multicast state * * If no bridges are ever used on this node, then this function does nothing. * * Otherwise this function outputs debug information to the 'mcast' log level * which might be relevant to our multicast optimizations. * * More precisely, it outputs information when a bridge interface is added or * removed from a mesh interface. And when a bridge is present, it further * outputs information about the querier state which is relevant for the * multicast flags this node is going to set. */ static void batadv_mcast_bridge_log(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *new_flags) { struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags; if (!old_flags->bridged && new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge added: Setting Unsnoopables(U)-flag\n"); else if (old_flags->bridged && !new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); if (new_flags->bridged) { batadv_mcast_querier_log(bat_priv, "IGMP", &old_flags->querier_ipv4, &new_flags->querier_ipv4); batadv_mcast_querier_log(bat_priv, "MLD", &old_flags->querier_ipv6, &new_flags->querier_ipv6); } } /** * batadv_mcast_flags_log() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the mesh interface information * @flags: TVLV flags indicating the new multicast state * * Whenever the multicast TVLV flags this node announces change, this function * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { bool old_enabled = bat_priv->mcast.mla_flags.enabled; u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; char str_old_flags[] = "[.... . .]"; sprintf(str_old_flags, "[%c%c%c%s%s%c]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", !(old_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Changing multicast flags from '%s' to '[%c%c%c%s%s%c]'\n", old_enabled ? str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ", !(flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.'); } /** * batadv_mcast_mla_flags_update() - update multicast flags * @bat_priv: the bat priv with all the mesh interface information * @flags: flags indicating the new multicast state * * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. */ static void batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *flags) { struct batadv_tvlv_mcast_data mcast_data; if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags))) return; batadv_mcast_bridge_log(bat_priv, flags); batadv_mcast_flags_log(bat_priv, flags->tvlv_flags); mcast_data.flags = flags->tvlv_flags; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.mla_flags = *flags; } /** * __batadv_mcast_mla_update() - update the own MLAs * @bat_priv: the bat priv with all the mesh interface information * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are * ensured by the non-parallel execution of the worker this function * belongs to. */ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { struct net_device *mesh_iface = bat_priv->mesh_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; struct batadv_mcast_mla_flags flags; int ret; flags = batadv_mcast_mla_flags_get(bat_priv); ret = batadv_mcast_mla_meshif_get(mesh_iface, &mcast_list, &flags); if (ret < 0) goto out; ret = batadv_mcast_mla_bridge_get(mesh_iface, &mcast_list, &flags); if (ret < 0) goto out; spin_lock(&bat_priv->mcast.mla_lock); batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); batadv_mcast_mla_flags_update(bat_priv, &flags); spin_unlock(&bat_priv->mcast.mla_lock); out: batadv_mcast_mla_list_free(&mcast_list); } /** * batadv_mcast_mla_update() - update the own MLAs * @work: kernel work struct * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * In the end, reschedules the work timer. */ static void batadv_mcast_mla_update(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_mcast *priv_mcast; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); __batadv_mcast_mla_update(bat_priv); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_is_report_ipv4() - check for IGMP reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid IGMP report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) { if (ip_mc_check_igmp(skb) < 0) return false; switch (igmp_hdr(skb)->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct iphdr *iphdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv4(skb)) return -EINVAL; iphdr = ip_hdr(skb); /* link-local multicast listeners behind a bridge are * not snoopable (see RFC4541, section 2.1.2.2) */ if (ipv4_is_local_multicast(iphdr->daddr)) *is_unsnoopable = true; else *is_routable = ETH_P_IP; return 0; } /** * batadv_mcast_is_report_ipv6() - check for MLD reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid MLD report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) { if (ipv6_mc_check_mld(skb) < 0) return false; switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_MGM_REPORT: case ICMPV6_MLD2_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ipv6hdr *ip6hdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv6(skb)) return -EINVAL; ip6hdr = ipv6_hdr(skb); if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL) return -EINVAL; /* link-local-all-nodes multicast listeners behind a bridge are * not snoopable (see RFC4541, section 3, paragraph 3) */ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) *is_unsnoopable = true; else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL) *is_routable = ETH_P_IPV6; return 0; } /** * batadv_mcast_forw_mode_check() - check for optimized forwarding potential * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ethhdr *ethhdr = eth_hdr(skb); if (!atomic_read(&bat_priv->multicast_mode)) return -EINVAL; switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, is_unsnoopable, is_routable); case ETH_P_IPV6: if (!IS_ENABLED(CONFIG_IPV6)) return -EINVAL; return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, is_unsnoopable, is_routable); default: return -EINVAL; } } /** * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast * interest * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: ethernet header of a packet * * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_ipv4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_ipv6); default: /* we shouldn't be here... */ return 0; } } /** * batadv_mcast_forw_rtr_count() - count nodes with a multicast router * @bat_priv: the bat priv with all the mesh interface information * @protocol: the ethernet protocol type to count multicast routers for * * Return: the number of nodes which want all routable IPv4 multicast traffic * if the protocol is ETH_P_IP or the number of nodes which want all routable * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0. */ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, int protocol) { switch (protocol) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_rtr4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_rtr6); default: return 0; } } /** * batadv_mcast_forw_mode_by_count() - get forwarding mode by count * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * @count: the number of originators the multicast packet need to be sent to * * For a multicast packet with multiple destination originators, checks which * mode to use. For BATADV_FORW_MCAST it also encapsulates the packet with a * complete batman-adv multicast header. * * Return: * BATADV_FORW_MCAST: If all nodes have multicast packet routing * capabilities and an MTU >= 1280 on all hard interfaces (including us) * and the encapsulated multicast packet with all destination addresses * would still fit into an 1280 bytes batman-adv multicast packet * (excluding the outer ethernet frame) and we could successfully push * the full batman-adv multicast packet header. * BATADV_FORW_UCASTS: If the packet cannot be sent in a batman-adv * multicast packet and the amount of batman-adv unicast packets needed * is smaller or equal to the configured multicast fanout. * BATADV_FORW_BCAST: Otherwise. */ static enum batadv_forw_mode batadv_mcast_forw_mode_by_count(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable, int count) { unsigned int mcast_hdrlen = batadv_mcast_forw_packet_hdrlen(count); u8 own_tvlv_flags = bat_priv->mcast.mla_flags.tvlv_flags; if (!atomic_read(&bat_priv->mcast.num_no_mc_ptype_capa) && own_tvlv_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && skb->len + mcast_hdrlen <= IPV6_MIN_MTU && batadv_mcast_forw_push(bat_priv, skb, vid, is_routable, count)) return BATADV_FORW_MCAST; if (count <= atomic_read(&bat_priv->multicast_fanout)) return BATADV_FORW_UCASTS; return BATADV_FORW_BCAST; } /** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Return: The forwarding mode as enum batadv_forw_mode. */ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int *is_routable) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; struct ethhdr *ethhdr; int rtr_count = 0; ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable, is_routable); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) return BATADV_FORW_BCAST; ethhdr = eth_hdr(skb); tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest, BATADV_NO_FLAGS); ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); unsnoop_count = !is_unsnoopable ? 0 : atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); rtr_count = batadv_mcast_forw_rtr_count(bat_priv, *is_routable); total_count = tt_count + ip_count + unsnoop_count + rtr_count; if (!total_count) return BATADV_FORW_NONE; else if (unsnoop_count) return BATADV_FORW_BCAST; return batadv_mcast_forw_mode_by_count(bat_priv, skb, vid, *is_routable, total_count); } /** * batadv_mcast_forw_send_orig() - send a multicast packet to an originator * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to send * @vid: the vlan identifier * @orig_node: the originator to send the packet to * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, struct batadv_orig_node *orig_node) { /* Avoid sending multicast-in-unicast packets to other BLA * gateways - they already got the frame from the LAN side * we share with them. * TODO: Refactor to take BLA into account earlier, to avoid * reducing the mcast_fanout count. */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) { dev_kfree_skb(skb); return NET_XMIT_SUCCESS; } return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, orig_node, vid); } /** * batadv_mcast_forw_tt() - forwards a packet to multicast listeners * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any multicast * listener registered in the translation table. A transmission is performed * via a batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; struct batadv_tt_orig_list_entry *orig_entry; struct batadv_tt_global_entry *tt_global; const u8 *addr = eth_hdr(skb)->h_dest; tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global) goto out; rcu_read_lock(); hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_entry->orig_node); } rcu_read_unlock(); batadv_tt_global_entry_put(tt_global); out: return ret; } /** * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4 * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6 * @bat_priv: the bat priv with all the mesh interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4 * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr4_list, mcast_want_all_rtr4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6 * @bat_priv: the bat priv with all the mesh interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr6_list, mcast_want_all_rtr6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Sends copies of a frame with multicast destination to any node that signaled * interest in it, that is either via the translation table or the according * want-all flags. A transmission is performed via a batman-adv unicast packet * for each such destination node. * * The given skb is consumed/freed. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable) { int ret; ret = batadv_mcast_forw_tt(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } ret = batadv_mcast_forw_want_all(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } if (!is_routable) goto skip_mc_router; ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } skip_mc_router: consume_skb(skb); return ret; } /** * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, * orig, has toggled then this method updates the counter and the list * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) { atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv4_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) { atomic_dec(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv6_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) { atomic_dec(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr4_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) { atomic_inc(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr6_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) { atomic_inc(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_have_mc_ptype_update() - update multicast packet type counter * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag of this originator, orig, has * toggled then this method updates the counter accordingly. */ static void batadv_mcast_have_mc_ptype_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) && orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) atomic_inc(&bat_priv->mcast.num_no_mc_ptype_capa); /* switched from flag unset to set */ else if (mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA && !(orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA)) atomic_dec(&bat_priv->mcast.num_no_mc_ptype_capa); } /** * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV * @enabled: whether the originator has multicast TVLV support enabled * @tvlv_value: tvlv buffer containing the multicast flags * @tvlv_value_len: tvlv buffer length * * Return: multicast flags for the given tvlv buffer */ static u8 batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) { u8 mcast_flags = BATADV_NO_FLAGS; if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; if (!enabled) { mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; } /* remove redundant flags to avoid sending duplicate packets later */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) mcast_flags |= BATADV_MCAST_WANT_NO_RTR4; if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) mcast_flags |= BATADV_MCAST_WANT_NO_RTR6; return mcast_flags; } /** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data * @tvlv_value_len: tvlv buffer length */ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags; mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled, tvlv_value, tvlv_value_len); spin_lock_bh(&orig->mcast_handler_lock); if (orig_mcast_enabled && !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } else if (!orig_mcast_enabled && test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); batadv_mcast_have_mc_ptype_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); } /** * batadv_mcast_init() - initialize the multicast optimizations structures * @bat_priv: the bat priv with all the mesh interface information */ void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, NULL, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); batadv_tvlv_handler_register(bat_priv, NULL, NULL, batadv_mcast_forw_tracker_tvlv_handler, BATADV_TVLV_MCAST_TRACKER, 1, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_mesh_info_put() - put multicast info into a netlink message * @msg: buffer for the message * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 or error code. */ int batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) { u32 flags = bat_priv->mcast.mla_flags.tvlv_flags; u32 flags_priv = BATADV_NO_FLAGS; if (bat_priv->mcast.mla_flags.bridged) { flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; if (bat_priv->mcast.mla_flags.querier_ipv4.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv6.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; } if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) || nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv)) return -EMSGSIZE; return 0; } /** * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @orig_node: originator to dump the multicast flags of * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_orig_node *orig_node) { void *hdr; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capabilities)) { if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, orig_node->mcast_flags)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } } genlmsg_end(msg, hdr); return 0; } /** * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags * table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hashtable *hash, unsigned int bucket, long *idx_skip) { struct batadv_orig_node *orig_node; long idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; if (idx < *idx_skip) goto skip; if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; } skip: idx++; } spin_unlock_bh(&hash->list_locks[bucket]); return 0; } /** * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @bat_priv: the bat priv with all the mesh interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump * * Return: 0 or error code. */ static int __batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, long *bucket, long *idx) { struct batadv_hashtable *hash = bat_priv->orig_hash; long bucket_tmp = *bucket; long idx_tmp = *idx; while (bucket_tmp < hash->size) { if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, bucket_tmp, &idx_tmp)) break; bucket_tmp++; idx_tmp = 0; } *bucket = bucket_tmp; *idx = idx_tmp; return msg->len; } /** * batadv_mcast_netlink_get_primary() - get primary interface from netlink * callback * @cb: netlink callback structure * @primary_if: the primary interface pointer to return the result in * * Return: 0 or error code. */ static int batadv_mcast_netlink_get_primary(struct netlink_callback *cb, struct batadv_hard_iface **primary_if) { struct batadv_hard_iface *hard_iface = NULL; struct net_device *mesh_iface; struct batadv_priv *bat_priv; int ret = 0; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); hard_iface = batadv_primary_if_get_selected(bat_priv); if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } out: dev_put(mesh_iface); if (!ret && primary_if) *primary_if = hard_iface; else batadv_hardif_put(hard_iface); return ret; } /** * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct batadv_priv *bat_priv; long *bucket = &cb->args[0]; long *idx = &cb->args[1]; int ret; ret = batadv_mcast_netlink_get_primary(cb, &primary_if); if (ret) return ret; bat_priv = netdev_priv(primary_if->mesh_iface); ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); return ret; } /** * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the mesh interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->mcast.work); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST_TRACKER, 1); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); /* safely calling outside of worker, as worker was canceled above */ batadv_mcast_mla_tt_retract(bat_priv, NULL); } /** * batadv_mcast_purge_orig() - reset originator global mcast state modifications * @orig: the originator which is going to get purged */ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; spin_lock_bh(&orig->mcast_handler_lock); batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR4); batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR6); batadv_mcast_have_mc_ptype_update(bat_priv, orig, BATADV_MCAST_HAVE_MC_PTYPE_CAPA); spin_unlock_bh(&orig->mcast_handler_lock); }
3 3 2 9 1 1 7 2 9 9 4 5 5 3 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/netfilter/xt_IDLETIMER.c * * Netfilter module to trigger a timer when packet matches. * After timer expires a kevent will be sent. * * Copyright (C) 2004, 2010 Nokia Corporation * Written by Timo Teras <ext-timo.teras@nokia.com> * * Converted to x_tables and reworked for upstream inclusion * by Luciano Coelho <luciano.coelho@nokia.com> * * Contact: Luciano Coelho <luciano.coelho@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/timer.h> #include <linux/alarmtimer.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_IDLETIMER.h> #include <linux/kdev_t.h> #include <linux/kobject.h> #include <linux/workqueue.h> #include <linux/sysfs.h> struct idletimer_tg { struct list_head entry; struct alarm alarm; struct timer_list timer; struct work_struct work; struct kobject *kobj; struct device_attribute attr; unsigned int refcnt; u8 timer_type; }; static LIST_HEAD(idletimer_tg_list); static DEFINE_MUTEX(list_mutex); static struct kobject *idletimer_tg_kobj; static struct idletimer_tg *__idletimer_tg_find_by_label(const char *label) { struct idletimer_tg *entry; list_for_each_entry(entry, &idletimer_tg_list, entry) { if (!strcmp(label, entry->attr.attr.name)) return entry; } return NULL; } static ssize_t idletimer_tg_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idletimer_tg *timer; unsigned long expires = 0; struct timespec64 ktimespec = {}; long time_diff = 0; mutex_lock(&list_mutex); timer = __idletimer_tg_find_by_label(attr->attr.name); if (timer) { if (timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t expires_alarm = alarm_expires_remaining(&timer->alarm); ktimespec = ktime_to_timespec64(expires_alarm); time_diff = ktimespec.tv_sec; } else { expires = timer->timer.expires; time_diff = jiffies_to_msecs(expires - jiffies) / 1000; } } mutex_unlock(&list_mutex); if (time_after(expires, jiffies) || ktimespec.tv_sec > 0) return sysfs_emit(buf, "%ld\n", time_diff); return sysfs_emit(buf, "0\n"); } static void idletimer_tg_work(struct work_struct *work) { struct idletimer_tg *timer = container_of(work, struct idletimer_tg, work); sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name); } static void idletimer_tg_expired(struct timer_list *t) { struct idletimer_tg *timer = timer_container_of(timer, t, timer); pr_debug("timer %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) { struct idletimer_tg *timer = alarm->data; pr_debug("alarm %s expired\n", timer->attr.attr.name); schedule_work(&timer->work); } static int idletimer_check_sysfs_name(const char *name, unsigned int size) { int ret; ret = xt_check_proc_name(name, size); if (ret < 0) return ret; if (!strcmp(name, "power") || !strcmp(name, "subsystem") || !strcmp(name, "uevent")) return -EINVAL; return 0; } static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; info->timer = kzalloc_obj(*info->timer); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } list_add(&info->timer->entry, &idletimer_tg_list); timer_setup(&info->timer->timer, idletimer_tg_expired, 0); info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) { int ret; info->timer = kmalloc_obj(*info->timer); if (!info->timer) { ret = -ENOMEM; goto out; } ret = idletimer_check_sysfs_name(info->label, sizeof(info->label)); if (ret < 0) goto out_free_timer; sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; goto out_free_timer; } info->timer->attr.attr.mode = 0444; info->timer->attr.show = idletimer_tg_show; ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr); if (ret < 0) { pr_debug("couldn't add file to sysfs"); goto out_free_attr; } /* notify userspace */ kobject_uevent(idletimer_tg_kobj,KOBJ_ADD); list_add(&info->timer->entry, &idletimer_tg_list); pr_debug("timer type value is %u", info->timer_type); info->timer->timer_type = info->timer_type; info->timer->refcnt = 1; INIT_WORK(&info->timer->work, idletimer_tg_work); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout; alarm_init(&info->timer->alarm, ALARM_BOOTTIME, idletimer_tg_alarmproc); info->timer->alarm.data = info->timer; tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } return 0; out_free_attr: kfree(info->timer->attr.attr.name); out_free_timer: kfree(info->timer); out: return ret; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } /* * The actual xt_tables plugin. */ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("resetting timer %s, timeout period %u\n", info->label, info->timeout); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { ktime_t tout = ktime_set(info->timeout, 0); alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; } static int idletimer_tg_helper(struct idletimer_tg_info *info) { if (info->timeout == 0) { pr_debug("timeout value is zero\n"); return -EINVAL; } if (info->timeout >= INT_MAX / 1000) { pr_debug("timeout value is too big\n"); return -EINVAL; } if (info->label[0] == '\0' || strnlen(info->label, MAX_IDLETIMER_LABEL_SIZE) == MAX_IDLETIMER_LABEL_SIZE) { pr_debug("label is empty or not nul-terminated\n"); return -EINVAL; } return 0; } static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) { struct idletimer_tg_info *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); ret = idletimer_tg_helper(info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { if (info->timer->timer_type & XT_IDLETIMER_ALARM) { pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); mutex_unlock(&list_mutex); return -EINVAL; } info->timer->refcnt++; mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) { struct idletimer_tg_info_v1 *info = par->targinfo; int ret; pr_debug("checkentry targinfo%s\n", info->label); if (info->send_nl_msg) return -EOPNOTSUPP; ret = idletimer_tg_helper((struct idletimer_tg_info *)info); if(ret < 0) { pr_debug("checkentry helper return invalid\n"); return -EINVAL; } if (info->timer_type > XT_IDLETIMER_ALARM) { pr_debug("invalid value for timer type\n"); return -EINVAL; } mutex_lock(&list_mutex); info->timer = __idletimer_tg_find_by_label(info->label); if (info->timer) { if (info->timer->timer_type != info->timer_type) { pr_debug("Adding/Replacing rule with same label and different timer type is not allowed\n"); mutex_unlock(&list_mutex); return -EINVAL; } info->timer->refcnt++; if (info->timer_type & XT_IDLETIMER_ALARM) { /* calculate remaining expiry time */ ktime_t tout = alarm_expires_remaining(&info->timer->alarm); struct timespec64 ktimespec = ktime_to_timespec64(tout); if (ktimespec.tv_sec > 0) { pr_debug("time_expiry_remaining %lld\n", ktimespec.tv_sec); alarm_start_relative(&info->timer->alarm, tout); } } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); } else { ret = idletimer_tg_create_v1(info); if (ret < 0) { pr_debug("failed to create timer\n"); mutex_unlock(&list_mutex); return ret; } } mutex_unlock(&list_mutex); return 0; } static void idletimer_tg_destroy(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); timer_shutdown_sync(&info->timer->timer); cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static void idletimer_tg_destroy_v1(const struct xt_tgdtor_param *par) { const struct idletimer_tg_info_v1 *info = par->targinfo; pr_debug("destroy targinfo %s\n", info->label); mutex_lock(&list_mutex); if (--info->timer->refcnt > 0) { pr_debug("decreased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); mutex_unlock(&list_mutex); return; } pr_debug("deleting timer %s\n", info->label); list_del(&info->timer->entry); mutex_unlock(&list_mutex); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { alarm_cancel(&info->timer->alarm); } else { timer_shutdown_sync(&info->timer->timer); } cancel_work_sync(&info->timer->work); sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr); kfree(info->timer->attr.attr.name); kfree(info->timer); } static struct xt_target idletimer_tg[] __read_mostly = { { .name = "IDLETIMER", .family = NFPROTO_IPV4, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV4, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "IDLETIMER", .family = NFPROTO_IPV6, .target = idletimer_tg_target, .targetsize = sizeof(struct idletimer_tg_info), .usersize = offsetof(struct idletimer_tg_info, timer), .checkentry = idletimer_tg_checkentry, .destroy = idletimer_tg_destroy, .me = THIS_MODULE, }, { .name = "IDLETIMER", .family = NFPROTO_IPV6, .revision = 1, .target = idletimer_tg_target_v1, .targetsize = sizeof(struct idletimer_tg_info_v1), .usersize = offsetof(struct idletimer_tg_info_v1, timer), .checkentry = idletimer_tg_checkentry_v1, .destroy = idletimer_tg_destroy_v1, .me = THIS_MODULE, }, #endif }; static struct class *idletimer_tg_class; static struct device *idletimer_tg_device; static int __init idletimer_tg_init(void) { int err; idletimer_tg_class = class_create("xt_idletimer"); err = PTR_ERR(idletimer_tg_class); if (IS_ERR(idletimer_tg_class)) { pr_debug("couldn't register device class\n"); goto out; } idletimer_tg_device = device_create(idletimer_tg_class, NULL, MKDEV(0, 0), NULL, "timers"); err = PTR_ERR(idletimer_tg_device); if (IS_ERR(idletimer_tg_device)) { pr_debug("couldn't register system device\n"); goto out_class; } idletimer_tg_kobj = &idletimer_tg_device->kobj; err = xt_register_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); if (err < 0) { pr_debug("couldn't register xt target\n"); goto out_dev; } return 0; out_dev: device_destroy(idletimer_tg_class, MKDEV(0, 0)); out_class: class_destroy(idletimer_tg_class); out: return err; } static void __exit idletimer_tg_exit(void) { xt_unregister_targets(idletimer_tg, ARRAY_SIZE(idletimer_tg)); device_destroy(idletimer_tg_class, MKDEV(0, 0)); class_destroy(idletimer_tg_class); } module_init(idletimer_tg_init); module_exit(idletimer_tg_exit); MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS("ipt_IDLETIMER"); MODULE_ALIAS("ip6t_IDLETIMER");
8 1 8 8 9 8 12 1 7 6 8 12 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 // SPDX-License-Identifier: GPL-2.0 /* * ext4.h * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/include/linux/minix_fs.h * * Copyright (C) 1991, 1992 Linus Torvalds */ #ifndef _EXT4_H #define _EXT4_H #include <linux/refcount.h> #include <linux/types.h> #include <linux/blkdev.h> #include <linux/magic.h> #include <linux/jbd2.h> #include <linux/quota.h> #include <linux/rwsem.h> #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/sched/signal.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #include <linux/fiemap.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif #include <uapi/linux/ext4.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> #include <linux/compiler.h> /* * The fourth extended filesystem constants/structures */ /* * with AGGRESSIVE_CHECK allocator runs consistency checks over * structures. these checks slow things down a lot */ #define AGGRESSIVE_CHECK__ /* * with DOUBLE_CHECK defined mballoc creates persistent in-core * bitmaps, maintains and uses them to check for double allocations */ #define DOUBLE_CHECK__ /* * Define EXT4FS_DEBUG to produce debug messages */ #undef EXT4FS_DEBUG /* * Debug code */ #ifdef EXT4FS_DEBUG #define ext4_debug(f, a...) \ do { \ printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ __FILE__, __LINE__, __func__); \ printk(KERN_DEBUG f, ## a); \ } while (0) #else #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c */ #define EXT_DEBUG__ /* * Dynamic printk for controlled extents debugging. */ #ifdef CONFIG_EXT4_DEBUG #define ext_debug(ino, fmt, ...) \ pr_debug("[%s/%d] EXT4-fs (%s): ino %llu: (%s, %d): %s:" fmt, \ current->comm, task_pid_nr(current), \ ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ __func__, ##__VA_ARGS__) #else #define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define ASSERT(assert) \ do { \ if (unlikely(!(assert))) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: '%s'\n", \ __func__, __FILE__, __LINE__, #assert); \ BUG(); \ } \ } while (0) /* data type for block offset of block group */ typedef int ext4_grpblk_t; /* data type for filesystem-wide blocks number */ typedef unsigned long long ext4_fsblk_t; /* data type for file logical block number */ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; enum SHIFT_DIRECTION { SHIFT_LEFT = 0, SHIFT_RIGHT, }; /* * For each criteria, mballoc has slightly different way of finding * the required blocks nad usually, higher the criteria the slower the * allocation. We start at lower criterias and keep falling back to * higher ones if we are not able to find any blocks. Lower (earlier) * criteria are faster. */ enum criteria { /* * Used when number of blocks needed is a power of 2. This * doesn't trigger any disk IO except prefetch and is the * fastest criteria. */ CR_POWER2_ALIGNED, /* * Tries to lookup in-memory data structures to find the most * suitable group that satisfies goal request. No disk IO * except block prefetch. */ CR_GOAL_LEN_FAST, /* * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal * length to the best available length for faster allocation. */ CR_BEST_AVAIL_LEN, /* * Reads each block group sequentially, performing disk IO if * necessary, to find suitable block group. Tries to * allocate goal length but might trim the request if nothing * is found after enough tries. */ CR_GOAL_LEN_SLOW, /* * Finds the first free set of blocks and allocates * those. This is only used in rare cases when * CR_GOAL_LEN_SLOW also fails to allocate anything. */ CR_ANY_FREE, /* * Number of criterias defined. */ EXT4_MB_NUM_CRS }; /* * Flags used in mballoc's allocation_context flags field. * * Also used to show what's going on for debugging purposes when the * flag field is exported via the traceport interface */ /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 /* first blocks in the file */ #define EXT4_MB_HINT_FIRST 0x0008 /* data is being allocated */ #define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ #define EXT4_MB_HINT_NOPREALLOC 0x0040 /* allocate for locality group */ #define EXT4_MB_HINT_GROUP_ALLOC 0x0080 /* allocate goal blocks or none */ #define EXT4_MB_HINT_GOAL_ONLY 0x0100 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 0x0200 /* blocks already pre-reserved by delayed allocation */ #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 /* Use blocks from reserved pool */ #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; /* how many blocks we want to allocate */ unsigned int len; /* logical block in target inode */ ext4_lblk_t logical; /* the closest logical allocated block to the left */ ext4_lblk_t lleft; /* the closest logical allocated block to the right */ ext4_lblk_t lright; /* phys. target (a hint) */ ext4_fsblk_t goal; /* phys. block for the closest logical allocated block to the left */ ext4_fsblk_t pleft; /* phys. block for the closest logical allocated block to the right */ ext4_fsblk_t pright; /* flags. see above EXT4_MB_HINT_* */ unsigned int flags; }; /* * Logical to physical block mapping, used by ext4_map_blocks() * * This structure is used to pass requests into ext4_map_blocks() as * well as to store the information returned by ext4_map_blocks(). It * takes less room on the stack than a struct buffer_head. */ #define EXT4_MAP_NEW BIT(BH_New) #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED BIT(BH_Delay) /* * This is for use in ext4_map_query_blocks() for a special case where we can * have a physically and logically contiguous blocks split across two leaf * nodes instead of a single extent. This is required in case of atomic writes * to know whether the returned extent is last in leaf. If yes, then lookup for * next in leaf block in ext4_map_query_blocks_next_in_leaf(). * - This is never going to be added to any buffer head state. * - We use the next available bit after BH_BITMAP_UPTODATE. */ #define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; ext4_lblk_t m_lblk; unsigned int m_len; unsigned int m_flags; u64 m_seq; }; /* * Block validity checking, system zone rbtree. */ struct ext4_system_blocks { struct rb_root root; struct rcu_head rcu; }; /* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_FAILED 0x0002 #define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ }; /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ handle_t *handle; /* handle reserved for extent * conversion */ struct inode *inode; /* file being written to */ struct bio *bio; /* Linked list of completed * bios covering the extent */ unsigned int flag; /* unwritten or not */ refcount_t count; /* reference counter */ struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; }; /* * Special inodes numbers */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ #define EXT4_USR_QUOTA_INO 3 /* User quota inode */ #define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ #define EXT4_JOURNAL_INO 8 /* Journal inode */ /* First non-reserved inode for old ext4 filesystems */ #define EXT4_GOOD_OLD_FIRST_INO 11 /* * Maximal count of links to a file */ #define EXT4_LINK_MAX 65000 /* * Macro-instructions used to manage several block sizes */ #define EXT4_MIN_BLOCK_SIZE 1024 #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 #define EXT4_MAX_BLOCK_LOG_SIZE 16 #define EXT4_MAX_CLUSTER_LOG_SIZE 30 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) #define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) # define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif #ifdef __KERNEL__ #define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) #define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) #define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) #else #define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ EXT4_GOOD_OLD_INODE_SIZE : \ (s)->s_inode_size) #define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ EXT4_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) #define EXT4_B_TO_LBLK(inode, offset) \ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) #define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits) /* Translate a block number to a page index */ #define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \ PAGE_SHIFT) /* Translate a page index to a block number */ #define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \ (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) /* Translate a cluster number to a block number */ #define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) /* Mask out the low bits to get the starting block of the cluster */ #define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* Fill in the low bits to get the last block of the cluster */ #define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor */ struct ext4_group_desc { __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ __le32 bg_inode_table_lo; /* Inodes table block */ __le16 bg_free_blocks_count_lo;/* Free blocks count */ __le16 bg_free_inodes_count_lo;/* Free inodes count */ __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ __le32 bg_inode_table_hi; /* Inodes table block MSB */ __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ __u32 bg_reserved; }; #define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ sizeof(__le16)) #define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ sizeof(__le16)) /* * Structure of a flex block group info */ struct flex_groups { atomic64_t free_clusters; atomic_t free_inodes; atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ /* * Macro-instructions used to manage group descriptors */ #define EXT4_MIN_DESC_SIZE 32 #define EXT4_MIN_DESC_SIZE_64BIT 64 #define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef __KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) # define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) #else # define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) # define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) #endif /* * Constants relative to the data blocks */ #define EXT4_NDIR_BLOCKS 12 #define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS #define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) #define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) #define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) /* * Inode flags */ #define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ #define EXT4_UNRM_FL 0x00000002 /* Undelete */ #define EXT4_COMPR_FL 0x00000004 /* Compress file */ #define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ #define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ #define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ #define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ /* Reserved for compression usage... */ #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ /* nb: was previously EXT2_ECOMPR_FL */ #define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ /* User modifiable flags */ #define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ EXT4_UNRM_FL | \ EXT4_COMPR_FL | \ EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ EXT4_JOURNAL_DATA_FL | \ EXT4_NOTAIL_FL | \ EXT4_DIRSYNC_FL | \ EXT4_TOPDIR_FL | \ EXT4_EXTENTS_FL | \ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ EXT4_DAX_FL | \ EXT4_PROJINHERIT_FL | \ EXT4_CASEFOLD_FL) /* User visible flags */ #define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ EXT4_DIRTY_FL | \ EXT4_COMPRBLK_FL | \ EXT4_NOCOMPR_FL | \ EXT4_ENCRYPT_FL | \ EXT4_INDEX_FL | \ EXT4_VERITY_FL | \ EXT4_INLINE_DATA_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ EXT4_PROJINHERIT_FL)) /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) /* Flags which are mutually exclusive to DAX */ #define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) return flags; else if (S_ISREG(mode)) return flags & EXT4_REG_FLMASK; else return flags & EXT4_OTHER_FLMASK; } /* * Inode flags used for atomic set/get */ enum { EXT4_INODE_SECRM = 0, /* Secure deletion */ EXT4_INODE_UNRM = 1, /* Undelete */ EXT4_INODE_COMPR = 2, /* Compress file */ EXT4_INODE_SYNC = 3, /* Synchronous updates */ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ EXT4_INODE_APPEND = 5, /* writes to file may only append */ EXT4_INODE_NODUMP = 6, /* do not dump file */ EXT4_INODE_NOATIME = 7, /* do not update atime */ /* Reserved for compression usage... */ EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; /* * Since it's pretty easy to mix up bit numbers and hex values, we use a * build-time check to make sure that EXT4_XXX_FL is consistent with respect to * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost * any extra space in the compiled kernel image, otherwise, the build will fail. * It's important that these values are the same, since we are using * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk * values found in ext2, ext3 and ext4 filesystems, and of course the values * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); CHECK_FLAG_VALUE(UNRM); CHECK_FLAG_VALUE(COMPR); CHECK_FLAG_VALUE(SYNC); CHECK_FLAG_VALUE(IMMUTABLE); CHECK_FLAG_VALUE(APPEND); CHECK_FLAG_VALUE(NODUMP); CHECK_FLAG_VALUE(NOATIME); CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); CHECK_FLAG_VALUE(NOTAIL); CHECK_FLAG_VALUE(DIRSYNC); CHECK_FLAG_VALUE(TOPDIR); CHECK_FLAG_VALUE(HUGE_FILE); CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(CASEFOLD); CHECK_FLAG_VALUE(RESERVED); } #if defined(__KERNEL__) && defined(CONFIG_COMPAT) struct compat_ext4_new_group_input { u32 group; compat_u64 block_bitmap; compat_u64 inode_bitmap; compat_u64 inode_table; u32 blocks_count; u16 reserved_blocks; u16 unused; }; #endif /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; __u64 block_bitmap; __u64 inode_bitmap; __u64 inode_table; __u32 blocks_count; __u16 reserved_blocks; __u16 mdata_blocks; __u32 free_clusters_count; }; /* Indexes used to index group tables in ext4_new_group_data */ enum { BLOCK_BITMAP = 0, /* block bitmap */ INODE_BITMAP, /* inode bitmap */ INODE_TABLE, /* inode tables */ GROUP_TABLE_COUNT, }; /* * Flags used by ext4_map_blocks() */ /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 /* Request the creation of an unwritten extent */ #define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* * This means that we cannot merge newly allocated extents, and if we * found an unwritten extent, we need to split it. */ #define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008 /* Convert unwritten extent to initialized. */ #define EXT4_GET_BLOCKS_CONVERT 0x0010 /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 /* Write zeros to newly created written extents */ #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) /* Caller is in the context of data submission, such as writeback, * fsync, etc. Especially, in the generic writeback path, caller will * submit data before dropping transaction handle. This allows jbd2 * to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 /* * Atomic write caller needs this to query in the slow path of mixed mapping * case, when a contiguous extent can be split across two adjacent leaf nodes. * Look EXT4_MAP_QUERY_LAST_IN_LEAF. */ #define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 /* * The bit position of these flags must not overlap with any of the * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), * read_extent_tree_block(), ext4_split_extent_at(), * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be * caching the extents when reading from the extent tree while a * truncate or punch hole operation is in progress. */ #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 /* * ext4_map_query_blocks() uses this filter mask to filter the flags needed to * pass while lookup/querying of on disk extent tree. */ #define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ EXT4_EX_NOFAIL |\ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation */ #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ #define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk */ struct ext4_inode { __le16 i_mode; /* File mode */ __le16 i_uid; /* Low 16 bits of Owner Uid */ __le32 i_size_lo; /* Size in bytes */ __le32 i_atime; /* Access time */ __le32 i_ctime; /* Inode Change time */ __le32 i_mtime; /* Modification time */ __le32 i_dtime; /* Deletion Time */ __le16 i_gid; /* Low 16 bits of Group Id */ __le16 i_links_count; /* Links count */ __le32 i_blocks_lo; /* Blocks count */ __le32 i_flags; /* File flags */ union { struct { __le32 l_i_version; } linux1; struct { __u32 h_i_translator; } hurd1; struct { __u32 m_i_reserved1; } masix1; } osd1; /* OS dependent 1 */ __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ __le32 i_generation; /* File version (for NFS) */ __le32 i_file_acl_lo; /* File ACL */ __le32 i_size_high; __le32 i_obso_faddr; /* Obsoleted fragment address */ union { struct { __le16 l_i_blocks_high; /* were l_i_reserved1 */ __le16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ __le16 l_i_reserved; } linux2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __u16 h_i_mode_high; __u16 h_i_uid_high; __u16 h_i_gid_high; __u32 h_i_author; } hurd2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __le16 m_i_file_acl_high; __u32 m_i_reserved2[2]; } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ __le32 i_crtime; /* File Creation time */ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ __le32 i_version_hi; /* high 32 bits for 64-bit version */ __le32 i_projid; /* Project ID */ }; #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) /* * Extended fields will fit into an inode if the filesystem was formatted * with large inodes (-I 256 or larger) and there are not currently any EAs * consuming all of the available space. For new inodes we always reserve * enough space for the kernel's known extended fields, but for inodes * created with an old kernel this might not have been the case. None of * the extended inode fields is critical for correct filesystem operation. * This macro checks if a certain field fits in the inode. Note that * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize */ #define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ ((offsetof(typeof(*ext4_inode), field) + \ sizeof((ext4_inode)->field)) \ <= (EXT4_GOOD_OLD_INODE_SIZE + \ (einode)->i_extra_isize)) \ /* * We use an encoding that preserves the times for extra epoch "00": * * extra msb of adjust for signed * epoch 32-bit 32-bit tv_sec to * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 * * Note that previous versions of the kernel on 64-bit systems would * incorrectly use extra epoch bits 1,1 for dates between 1901 and * 1970. e2fsck will correct this, assuming that it is run on the * affected filesystem before 2242. */ static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } static inline struct timespec64 ext4_decode_extra_time(__le32 base, __le32 extra) { struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; return ts; } #define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ } else \ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) #define EXT4_INODE_SET_ATIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) #define EXT4_INODE_SET_MTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) #define EXT4_INODE_SET_CTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ raw_inode, (einode)->xtime) #define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ ext4_decode_extra_time((raw_inode)->xtime, \ (raw_inode)->xtime ## _extra) : \ (struct timespec64) { \ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ }) #define EXT4_INODE_GET_ATIME(inode, raw_inode) \ do { \ inode_set_atime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_MTIME(inode, raw_inode) \ do { \ inode_set_mtime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_CTIME(inode, raw_inode) \ do { \ inode_set_ctime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime = \ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ raw_inode); \ else \ (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version #if defined(__KERNEL__) || defined(__linux__) #define i_reserved1 osd1.linux1.l_i_reserved1 #define i_file_acl_high osd2.linux2.l_i_file_acl_high #define i_blocks_high osd2.linux2.l_i_blocks_high #define i_uid_low i_uid #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high #define i_checksum_lo osd2.linux2.l_i_checksum_lo #elif defined(__GNU__) #define i_translator osd1.hurd1.h_i_translator #define i_uid_high osd2.hurd2.h_i_uid_high #define i_gid_high osd2.hurd2.h_i_gid_high #define i_author osd2.hurd2.h_i_author #elif defined(__masix__) #define i_reserved1 osd1.masix1.m_i_reserved1 #define i_file_acl_high osd2.masix2.m_i_file_acl_high #define i_reserved2 osd2.masix2.m_i_reserved2 #endif /* defined(__KERNEL__) || defined(__linux__) */ #include "extents_status.h" #include "fast_commit.h" /* * Lock subclasses for i_data_sem in the ext4_inode_info structure. * * These are needed to avoid lockdep false positives when we need to * allocate blocks to the quota inode during ext4_map_blocks(), while * holding i_data_sem for a normal (non-quota) inode. Since we don't * do quota tracking for the quota inode, this avoids deadlock (as * well as infinite recursion, since it isn't turtles all the way * down...) * * I_DATA_SEM_NORMAL - Used for most inodes * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode * where the second inode has larger inode number * than the first * I_DATA_SEM_QUOTA - Used for quota inodes only * I_DATA_SEM_EA - Used for ea_inodes only */ enum { I_DATA_SEM_NORMAL = 0, I_DATA_SEM_OTHER, I_DATA_SEM_QUOTA, I_DATA_SEM_EA }; /* * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ __u32 i_dtime; ext4_fsblk_t i_file_acl; /* * i_block_group is the number of the block group which contains * this file's inode. Constant across the lifetime of the inode, * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ ext4_group_t i_block_group; ext4_lblk_t i_dir_start_lookup; #if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ #endif unsigned long i_flags; /* * Extended attributes can be read independently of the main file * data. Taking i_rwsem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. */ struct rw_semaphore xattr_sem; /* * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise * i_orphan is used. */ union { struct list_head i_orphan; /* unlinked but open inodes */ unsigned int i_orphan_idx; /* Index in orphan file */ }; /* Fast commit related info */ /* For tracking dentry create updates */ struct list_head i_fc_dilist; struct list_head i_fc_list; /* * inodes that need fast commit * protected by sbi->s_fc_lock. */ /* Start of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_start; /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; /* * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len * and inode's EXT4_FC_STATE_COMMITTING state bit. */ spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not * in memory. During truncate, i_size is set to the new size by * the VFS prior to calling ext4_truncate(), but the filesystem won't * set i_disksize to 0 until the truncate is actually under way. * * The intent is that i_disksize always represents the blocks which * are used by this file. This allows recovery to restart truncate * on orphans if we crash during truncate. We actually write i_disksize * into the on-disk inode when writing inodes out, instead of i_size. * * The only time when i_disksize and i_size may be different is when * a truncate is in progress. The only things which change i_disksize * are ext4_get_block (growth) and ext4_truncate (shrinkth). */ loff_t i_disksize; /* * i_data_sem is for serialising ext4_truncate() against * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's * data tree are chopped off during truncate. We can't do that in * ext4 because whenever we perform intermediate commits during * truncate, the inode and all the metadata blocks *must* be in a * consistent state which allows truncation of the orphans to restart * during recovery. Hence we must fix the get_block-vs-truncate race * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; struct inode vfs_inode; struct jbd2_inode *jinode; /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. */ struct timespec64 i_crtime; /* mballoc */ atomic_t i_prealloc_active; /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; struct rb_root i_prealloc_node; rwlock_t i_prealloc_lock; /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_shk_nr; /* protected by i_es_lock */ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ u64 i_es_seq; /* Change counter for extents. Protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; /* on-disk additional length */ __u16 i_extra_isize; /* Indicate the inline data space. */ u16 i_inline_off; u16 i_inline_size; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif spinlock_t i_block_reservation_lock; /* Lock protecting lists below */ spinlock_t i_completed_io_lock; /* * Completed IOs that need unwritten extents handling and have * transaction reserved */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. */ tid_t i_sync_tid; tid_t i_datasync_tid; #ifdef CONFIG_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; kprojid_t i_projid; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif }; /* * File system states */ #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT4_ERROR_FS 0x0002 /* Errors detected */ #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ #define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ /* * Misc. filesystem flags */ #define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ #define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX #define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else #define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ #define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ #define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, * enable enforcement for hidden * quota files */ #define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable * enforcement for hidden quota * files */ #define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ #define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ /* * Mount flags set either automatically (could not be set by mount option) * based on per file system feature or property or in special cases such as * distinguishing between explicit mount option definition and default. */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ #define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) #define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ ~EXT4_MOUNT2_##opt #define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ EXT4_MOUNT2_##opt #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le extern void mb_set_bits(void *bm, int cur, int len); /* * Maximal mount counts between two filesystem checks */ #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ /* * Behaviour when detecting errors */ #define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ #define EXT4_ERRORS_RO 2 /* Remount fs read-only */ #define EXT4_ERRORS_PANIC 3 /* Panic */ #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 #define EXT4_LABEL_MAX 16 /* * Structure of the super block */ struct ext4_super_block { /*00*/ __le32 s_inodes_count; /* Inodes count */ __le32 s_blocks_count_lo; /* Blocks count */ __le32 s_r_blocks_count_lo; /* Reserved blocks count */ __le32 s_free_blocks_count_lo; /* Free blocks count */ /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ __le16 s_mnt_count; /* Mount count */ __le16 s_max_mnt_count; /* Maximal mount count */ __le16 s_magic; /* Magic signature */ __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le16 s_minor_rev_level; /* minor revision level */ /*40*/ __le32 s_lastcheck; /* time of last check */ __le32 s_checkinterval; /* max. time between checks */ __le32 s_creator_os; /* OS */ __le32 s_rev_level; /* Revision level */ /*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ __le16 s_def_resgid; /* Default gid for reserved blocks */ /* * These fields are for EXT4_DYNAMIC_REV superblocks only. * * Note: the difference between the compatible feature set and * the incompatible feature set is that if there is a bit set * in the incompatible feature set that the kernel doesn't * know about, it should refuse to mount the filesystem. * * e2fsck's requirements are more strict; if it doesn't know * about a feature in either the compatible or incompatible * feature set, it must abort and not try to meddle with * things it doesn't understand... */ __le32 s_first_ino; /* First non-reserved inode */ __le16 s_inode_size; /* size of inode structure */ __le16 s_block_group_nr; /* block group # of this superblock */ __le32 s_feature_compat; /* compatible feature set */ /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ /*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* * Performance hints. Directory preallocation should only * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. */ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ /* * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. */ /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ /*E0*/ __le32 s_journal_inum; /* inode number of journal file */ __le32 s_journal_dev; /* device number of journal file */ __le32 s_last_orphan; /* start of list of inodes to delete */ __le32 s_hash_seed[4]; /* HTREE hash seed */ __u8 s_def_hash_version; /* Default hash version to use */ __u8 s_jnl_backup_type; __le16 s_desc_size; /* size of group descriptor */ /*100*/ __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ __le32 s_mkfs_time; /* When the filesystem was created */ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ __le32 s_r_blocks_count_hi; /* Reserved blocks count */ __le32 s_free_blocks_count_hi; /* Free blocks count */ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_checksum_type; /* metadata checksum algorithm used */ __u8 s_encryption_level; /* versioning level for encryption */ __u8 s_reserved_pad; /* Padding to next 32bits */ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ __le32 s_snapshot_id; /* sequential ID of active snapshot */ __le64 s_snapshot_r_blocks_count; /* reserved blocks for active snapshot's future use */ __le32 s_snapshot_list; /* inode number of the head of the on-disk snapshot list */ #define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) __le32 s_error_count; /* number of fs errors */ __le32 s_first_error_time; /* first time an error happened */ __le32 s_first_error_ino; /* inode involved in first error */ __le64 s_first_error_block; /* block involved of first error */ __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ __le32 s_first_error_line; /* line number where error happened */ __le32 s_last_error_time; /* most recent time of an error */ __le32 s_last_error_ino; /* inode involved in last error */ __le32 s_last_error_line; /* line number where error happened */ __le64 s_last_error_block; /* block involved of last error */ __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ __le32 s_lpf_ino; /* Location of the lost+found inode */ __le32 s_prj_quota_inum; /* inode for tracking project quota */ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ __u8 s_wtime_hi; __u8 s_mtime_hi; __u8 s_mkfs_time_hi; __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; __u8 s_first_error_errcode; __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ __le16 s_def_resuid_hi; __le16 s_def_resgid_hi; __le32 s_reserved[93]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) #ifdef __KERNEL__ /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 #define EXT4_ENC_UTF8_12_1 1 /* Types of ext4 journal triggers */ enum ext4_journal_trigger_type { EXT4_JTR_ORPHAN_FILE, EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ }; #define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE struct ext4_journal_trigger { struct jbd2_buffer_trigger_type tr_triggers; struct super_block *sb; }; static inline struct ext4_journal_trigger *EXT4_TRIGGER( struct jbd2_buffer_trigger_type *trigger) { return container_of(trigger, struct ext4_journal_trigger, tr_triggers); } #define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 /* Structure at the tail of orphan block */ struct ext4_orphan_block_tail { __le32 ob_magic; __le32 ob_checksum; }; static inline int ext4_inodes_per_orphan_block(struct super_block *sb) { return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / sizeof(u32); } struct ext4_orphan_block { atomic_t ob_free_entries; /* Number of free orphan entries in block */ struct buffer_head *ob_bh; /* Buffer for orphan block */ }; /* * Info about orphan file. */ struct ext4_orphan_info { int of_blocks; /* Number of orphan blocks in a file */ __u32 of_csum_seed; /* Checksum seed for orphan file */ struct ext4_orphan_block *of_binfo; /* Array with info about orphan * file blocks */ }; /* * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead; /* # of fs overhead clusters */ unsigned int s_cluster_ratio; /* Number of blocks per cluster */ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ /* Array of bh's for the block group descriptors */ struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned long s_mount_flags; unsigned int s_def_mount_opt; unsigned int s_def_mount_opt2; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; int s_desc_per_block_bits; int s_inode_size; int s_first_ino; unsigned int s_inode_readahead_blks; unsigned int s_inode_goal; u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyclusters_counter; struct percpu_counter s_sra_exceeded_retry_limit; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; struct buffer_head *s_mmp_bh; /* Journaling */ struct journal_s *s_journal; unsigned long s_ext4_flags; /* Ext4 superblock flags */ struct mutex s_orphan_lock; /* Protects on disk list changes */ struct list_head s_orphan; /* List of orphaned inodes in on disk list */ struct ext4_orphan_info s_orphan_info; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; struct file *s_journal_bdev_file; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ struct ext4_system_blocks __rcu *s_system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ unsigned long s_ext_min; unsigned long s_ext_max; unsigned long s_depth_max; spinlock_t s_ext_stats_lock; unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif /* for buddy allocator */ struct ext4_group_info ** __rcu *s_group_info; struct inode *s_buddy_cache; spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; atomic_t s_mb_free_pending; struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; struct xarray *s_mb_avg_fragment_size; struct xarray *s_mb_largest_free_orders; /* tunables */ unsigned long s_stripe; unsigned int s_mb_max_linear_groups; unsigned int s_mb_stream_request; unsigned int s_mb_max_to_scan; unsigned int s_mb_min_to_scan; unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_dir_size_kb; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; unsigned int s_sb_update_sec; unsigned int s_sb_update_kb; /* where last allocation was done - for stream allocation */ ext4_group_t *s_mb_last_groups; unsigned int s_mb_nr_global_goals; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */ atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; atomic_t s_lock_busy; /* locality groups */ struct ext4_locality_group __percpu *s_locality_groups; /* for write statistics */ unsigned long s_sectors_written_start; u64 s_kbytes_written; /* the size of zero-out chunk */ unsigned int s_extent_max_zeroout_kb; unsigned int s_log_groups_per_flex; struct flex_groups * __rcu *s_flex_groups; ext4_group_t s_flex_groups_allocated; /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; /* timeout in seconds for s_err_report; 0 disables the timer. */ unsigned long s_err_report_sec; /* Lazy inode table initialization info */ struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; /* record the last minlen when FITRIM is called. */ unsigned long s_last_trim_minblks; /* minimum folio order of a page cache allocation */ u16 s_min_folio_order; /* supported maximum folio order, 0 means not supported */ u16 s_max_folio_order; /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; /* Reclaim extents from extent status tree */ struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Journal triggers for checksum computation */ struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; atomic_t s_warning_count; atomic_t s_msg_count; /* Encryption policy for '-o test_dummy_encryption' */ struct fscrypt_dummy_policy s_dummy_enc_policy; /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag or between writepages ops and changing DELALLOC or * DIOREAD_NOLOCK mount options on remount. */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; u64 s_dax_part_off; #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif /* Record the errseq of the backing block device */ errseq_t s_bdev_wb_err; spinlock_t s_bdev_wb_lock; /* Information about errors that happened during this mount */ spinlock_t s_error_lock; int s_add_error_count; int s_first_error_code; __u32 s_first_error_line; __u32 s_first_error_ino; __u64 s_first_error_block; const char *s_first_error_func; time64_t s_first_error_time; int s_last_error_code; __u32 s_last_error_line; __u32 s_last_error_ino; __u64 s_last_error_block; const char *s_last_error_func; time64_t s_last_error_time; /* * If we are in a context where we cannot update the on-disk * superblock, we queue the work here. This is used to update * the error information in the superblock, and for periodic * updates of the superblock called from the commit callback * function. */ struct work_struct s_sb_upd_work; /* Atomic write unit values in bytes */ unsigned int s_awu_min; unsigned int s_awu_max; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; /* * After commit starts, the main queue gets locked, and the further * updates get added in the staging queue. */ #define FC_Q_MAIN 0 #define FC_Q_STAGING 1 struct list_head s_fc_q[2]; /* Inodes staged for fast commit * that have data changes in them. */ struct list_head s_fc_dentry_q[2]; /* directory entry updates */ unsigned int s_fc_bytes; /* * Main fast commit lock. This lock protects accesses to the * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. * * s_fc_lock can be taken from reclaim context (inode eviction) and is * thus reclaim unsafe. Use ext4_fc_lock()/ext4_fc_unlock() helpers * when acquiring / releasing the lock. */ struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif struct ext4_fc_replay_state s_fc_replay_state; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct ext4_inode_info *EXT4_I(struct inode *inode) { return container_of(inode, struct ext4_inode_info, vfs_inode); } static inline int ext4_writepages_down_read(struct super_block *sb) { percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_writepages_down_write(struct super_block *sb) { percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_fc_lock(struct super_block *sb) { mutex_lock(&EXT4_SB(sb)->s_fc_lock); return memalloc_nofs_save(); } static inline void ext4_fc_unlock(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); mutex_unlock(&EXT4_SB(sb)->s_fc_lock); } static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } static inline int ext4_get_resuid(struct ext4_super_block *es) { return le16_to_cpu(es->s_def_resuid) | le16_to_cpu(es->s_def_resuid_hi) << 16; } static inline int ext4_get_resgid(struct ext4_super_block *es) { return le16_to_cpu(es->s_def_resgid) | le16_to_cpu(es->s_def_resgid_hi) << 16; } /* * Returns: sbi->field[index] * Used to access an array element from the following sbi fields which require * rcu protection to avoid dereferencing an invalid pointer due to reassignment * - s_group_desc * - s_group_info * - s_flex_group */ #define sbi_array_rcu_deref(sbi, field, index) \ ({ \ typeof(*((sbi)->field)) _v; \ rcu_read_lock(); \ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ rcu_read_unlock(); \ _v; \ }) /* * run-time mount flags */ enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) { set_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) { clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline int ext4_test_mount_flag(struct super_block *sb, int bit) { return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); } /* * Simulate_fail codes */ #define EXT4_SIM_BBITMAP_EIO 1 #define EXT4_SIM_BBITMAP_CRC 2 #define EXT4_SIM_IBITMAP_EIO 3 #define EXT4_SIM_IBITMAP_CRC 4 #define EXT4_SIM_INODE_EIO 5 #define EXT4_SIM_INODE_CRC 6 #define EXT4_SIM_DIRBLOCK_EIO 7 #define EXT4_SIM_DIRBLOCK_CRC 8 static inline bool ext4_simulate_fail(struct super_block *sb, unsigned long code) { #ifdef CONFIG_EXT4_DEBUG struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(sbi->s_simulate_fail == code)) { sbi->s_simulate_fail = 0; return true; } #endif return false; } /* * Error number codes for s_{first,last}_error_errno * * Linux errno numbers are architecture specific, so we need to translate * them into something which is architecture independent. We don't define * codes for all errno's; just the ones which are most likely to be the cause * of an ext4_error() call. */ #define EXT4_ERR_UNKNOWN 1 #define EXT4_ERR_EIO 2 #define EXT4_ERR_ENOMEM 3 #define EXT4_ERR_EFSBADCRC 4 #define EXT4_ERR_EFSCORRUPTED 5 #define EXT4_ERR_ENOSPC 6 #define EXT4_ERR_ENOKEY 7 #define EXT4_ERR_EROFS 8 #define EXT4_ERR_EFBIG 9 #define EXT4_ERR_EEXIST 10 #define EXT4_ERR_ERANGE 11 #define EXT4_ERR_EOVERFLOW 12 #define EXT4_ERR_EBUSY 13 #define EXT4_ERR_ENOTDIR 14 #define EXT4_ERR_ENOTEMPTY 15 #define EXT4_ERR_ESHUTDOWN 16 #define EXT4_ERR_EFAULT 17 /* * Inode dynamic state flags */ enum { EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_flag(struct inode *inode, int bit); static inline void ext4_set_inode_flag(struct inode *inode, int bit); static inline void ext4_clear_inode_flag(struct inode *inode, int bit); EXT4_INODE_BIT_FNS(flag, flags, 0) /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_state(struct inode *inode, int bit); static inline void ext4_set_inode_state(struct inode *inode, int bit); static inline void ext4_clear_inode_state(struct inode *inode, int bit); #if (BITS_PER_LONG < 64) EXT4_INODE_BIT_FNS(state, state_flags, 0) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { (ei)->i_state_flags = 0; } #else EXT4_INODE_BIT_FNS(state, flags, 32) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { /* We depend on the fact that callers will set i_flags */ } #endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test * macros from user land. */ #define EXT4_SB(sb) (sb) #endif static inline bool ext4_verity_in_progress(struct inode *inode) { return IS_ENABLED(CONFIG_FS_VERITY) && ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); } #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* * Check whether the inode is tracked as orphan (either in orphan file or * orphan list). */ static inline bool ext4_inode_orphan_tracked(struct inode *inode) { return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || !list_empty(&EXT4_I(inode)->i_orphan); } /* * Codes for operating systems */ #define EXT4_OS_LINUX 0 #define EXT4_OS_HURD 1 #define EXT4_OS_MASIX 2 #define EXT4_OS_FREEBSD 3 #define EXT4_OS_LITES 4 /* * Revision levels */ #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV #define EXT4_GOOD_OLD_INODE_SIZE 128 #define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) #define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX #define EXT4_TIMESTAMP_MIN S32_MIN /* * Feature set definitions */ #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 /* * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes * incompatible only if fast commit blocks are present in the FS. Since we * clear the journal (and thus the fast commit blocks), we don't mark FS as * incompatible. We also have a JBD2 incompat feature, which gets set when * there are fast commit blocks present in the journal. */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 #define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 #define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 /* * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When * METADATA_CSUM is set, group descriptor checksums use the same algorithm as * all other data structures' checksums. However, the METADATA_CSUM and * GDT_CSUM bits are mutually exclusive. */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 #define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be non-empty */ #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 #define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 extern void ext4_update_dynamic_rev(struct super_block *sb); #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_compat &= \ ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } #define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } #define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_incompat &= \ ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ EXT4_FEATURE_COMPAT_ORPHAN_FILE) #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ EXT4_FEATURE_INCOMPAT_CASEFOLD | \ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ EXT4_FEATURE_RO_COMPAT_PROJECT |\ EXT4_FEATURE_RO_COMPAT_VERITY |\ EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ } EXTN_FEATURE_FUNCS(2) EXTN_FEATURE_FUNCS(3) EXTN_FEATURE_FUNCS(4) static inline bool ext4_has_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_compat != 0); } static inline bool ext4_has_ro_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); } static inline bool ext4_has_incompat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); } extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ enum { EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ }; static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } static inline int ext4_emergency_ro(struct super_block *sb) { return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static inline int ext4_emergency_state(struct super_block *sb) { if (unlikely(ext4_forced_shutdown(sb))) return -EIO; if (unlikely(ext4_emergency_ro(sb))) return -EROFS; return 0; } /* * Default values for user and/or group using reserved blocks */ #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 /* * Default project ID */ #define EXT4_DEF_PROJID 0 #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* * Default mount options */ #define EXT4_DEFM_DEBUG 0x0001 #define EXT4_DEFM_BSDGROUPS 0x0002 #define EXT4_DEFM_XATTR_USER 0x0004 #define EXT4_DEFM_ACL 0x0008 #define EXT4_DEFM_UID16 0x0010 #define EXT4_DEFM_JMODE 0x0060 #define EXT4_DEFM_JMODE_DATA 0x0020 #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 #define EXT4_DEFM_NOBARRIER 0x0100 #define EXT4_DEFM_BLOCK_VALIDITY 0x0200 #define EXT4_DEFM_DISCARD 0x0400 #define EXT4_DEFM_NODELALLOC 0x0800 /* * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ #define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) /* * Default values for superblock update */ #define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup */ #define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 /* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 /* * Base length of the ext4 directory entry excluding the name length */ #define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __le16 name_len; /* Name length */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Encrypted Casefolded entries require saving the hash on disk. This structure * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned * boundary. */ struct ext4_dir_entry_hash { __le32 hash; __le32 minor_hash; }; /* * The new version of the directory entry. Since EXT4 structures are * stored in intel byte order, and the name_len field could never be * bigger than 255 chars, it's safe to reclaim the extra byte for the * file_type field. */ struct ext4_dir_entry_2 { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /* See file type macros EXT4_FT_* below */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Access the hashes at the end of ext4_dir_entry_2 */ #define EXT4_DIRENT_HASHES(entry) \ ((struct ext4_dir_entry_hash *) \ (((void *)(entry)) + \ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) #define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash) #define EXT4_DIRENT_MINOR_HASH(entry) \ le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash) static inline bool ext4_hash_in_dirent(const struct inode *inode) { return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); } /* * This is a bogus directory entry at the end of each leaf block that * records checksums. */ struct ext4_dir_entry_tail { __le32 det_reserved_zero1; /* Pretend to be unused */ __le16 det_rec_len; /* 12 */ __u8 det_reserved_zero2; /* Zero name length */ __u8 det_reserved_ft; /* 0xDE, fake file type */ __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; #define EXT4_DIRENT_TAIL(block, blocksize) \ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ ((blocksize) - \ sizeof(struct ext4_dir_entry_tail)))) /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. */ #define EXT4_FT_UNKNOWN 0 #define EXT4_FT_REG_FILE 1 #define EXT4_FT_DIR 2 #define EXT4_FT_CHRDEV 3 #define EXT4_FT_BLKDEV 4 #define EXT4_FT_FIFO 5 #define EXT4_FT_SOCK 6 #define EXT4_FT_SYMLINK 7 #define EXT4_FT_MAX 8 #define EXT4_FT_DIR_CSUM 0xDE /* * EXT4_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 */ #define EXT4_DIR_PAD 4 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) #define EXT4_MAX_REC_LEN ((1<<16)-1) /* * The rec_len is dependent on the type of directory. Directories that are * casefolded and encrypted need to store the hash as well, so we add room for * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should * pass NULL for dir, as those entries do not use the extra fields. */ static inline unsigned int ext4_dir_rec_len(__u8 name_len, const struct inode *dir) { int rec_len = (name_len + 8 + EXT4_DIR_ROUND); if (dir && ext4_hash_in_dirent(dir)) rec_len += sizeof(struct ext4_dir_entry_hash); return (rec_len & ~EXT4_DIR_ROUND); } static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { if (blocksize == 65536) return cpu_to_le16(EXT4_MAX_REC_LEN); else return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); } /* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 */ #define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) /* Legal values for the dx_root hash_version field: */ #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 #define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } #ifdef __KERNEL__ /* hash info structure used by the directory hash */ struct dx_hash_info { u32 hash; u32 minor_hash; int hash_version; u32 *seed; }; /* 32 and 64 bit signed EOF for dx directories */ #define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) #define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) /* * Control parameters used by ext4_htree_next_block */ #define HASH_NB_ALWAYS 1 struct ext4_filename { const struct qstr *usr_fname; struct fscrypt_str disk_name; struct dx_hash_info hinfo; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) struct qstr cf_name; #endif }; #define fname_name(p) ((p)->disk_name.name) #define fname_usr_name(p) ((p)->usr_fname->name) #define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory */ struct ext4_iloc { struct buffer_head *bh; unsigned long offset; ext4_group_t block_group; }; static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) { return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } static inline bool ext4_is_quota_file(struct inode *inode) { return IS_NOQUOTA(inode) && !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); } /* * This structure is stuffed into the struct file's private_data field * for directories. It is where we put information so that we can do * readdir operations in hash tree order. */ struct dir_private_info { struct rb_root root; struct rb_node *curr_node; struct fname *extra_fname; loff_t last_pos; __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; u64 cookie; bool initialized; }; /* calculate the first block number of the group */ static inline ext4_fsblk_t ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) { return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); } /* * Special error return code only used by dx_probe() and its callers. */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) /* htree levels for ext4 */ #define EXT4_HTREE_LEVEL_COMPAT 2 #define EXT4_HTREE_LEVEL 3 static inline int ext4_dir_htree_level(struct super_block *sb) { return ext4_has_feature_largedir(sb) ? EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; } /* * Timeout and state flag for lazy initialization inode thread. */ #define EXT4_DEF_LI_WAIT_MULT 10 #define EXT4_DEF_LI_MAX_START_DELAY 5 #define EXT4_LAZYINIT_QUIT 0x0001 #define EXT4_LAZYINIT_RUNNING 0x0002 /* * Lazy inode table initialization info */ struct ext4_lazy_init { unsigned long li_state; struct list_head li_request_list; struct mutex li_list_mtx; }; enum ext4_li_mode { EXT4_LI_MODE_PREFETCH_BBITMAP, EXT4_LI_MODE_ITABLE, }; struct ext4_li_request { struct super_block *lr_super; enum ext4_li_mode lr_mode; ext4_group_t lr_first_not_zeroed; ext4_group_t lr_next_group; struct list_head lr_request; unsigned long lr_next_sched; unsigned long lr_timeout; }; struct ext4_features { struct kobject f_kobj; struct completion f_kobj_unregister; }; /* * This structure will be used for multiple mount protection. It will be * written into the block number saved in the s_mmp_block field in the * superblock. Programs that check MMP should assume that if * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe * to use the filesystem, regardless of how old the timestamp is. */ #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ struct mmp_struct { __le32 mmp_magic; /* Magic number for MMP */ __le32 mmp_seq; /* Sequence no. updated periodically */ /* * mmp_time, mmp_nodename & mmp_bdevname are only used for information * purposes and do not affect the correctness of the algorithm */ __le64 mmp_time; /* Time last updated */ char mmp_nodename[64]; /* Node which last updated MMP block */ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ /* * mmp_check_interval is used to verify if the MMP block has been * updated on the block device. The value is updated based on the * maximum time to write the MMP block during an update cycle. */ __le16 mmp_check_interval; __le16 mmp_pad1; __le32 mmp_pad2[226]; __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ }; /* arguments passed to the mmp thread */ struct mmpd_data { struct buffer_head *bh; /* bh from initial read_mmp_block() */ struct super_block *sb; /* super block of the fs */ }; /* * Check interval multiplier * The MMP block is written every update interval and initially checked every * update interval x the multiplier (the value is then adapted based on the * write latency). The reason is that writes can be delayed under load and we * don't want readers to incorrectly assume that the filesystem is no longer * in use. */ #define EXT4_MMP_CHECK_MULT 2UL /* * Minimum interval for MMP checking in seconds. */ #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL /* * Maximum interval for MMP checking in seconds. */ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL /* * Function prototypes */ /* * Ok, these declarations are also in <linux/kernel.h> but none of the * ext4 source programs needs to include it so they are duplicated here. */ # define NORET_TYPE /**/ # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); /* balloc.c */ extern void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp); extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) { kfree(fname->cf_name.name); fname->cf_name.name = NULL; } #else static inline int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname) { return 0; } static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) { } #endif /* ext4 encryption related stuff goes here crypto.c */ #ifdef CONFIG_FS_ENCRYPTION extern const struct fscrypt_operations ext4_cryptops; int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname); void ext4_fname_free_filename(struct ext4_filename *fname); int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); #else /* !CONFIG_FS_ENCRYPTION */ static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; return ext4_fname_setup_ci_filename(dir, iname, fname); } static inline int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname) { return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); } static inline void ext4_fname_free_filename(struct ext4_filename *fname) { ext4_fname_free_ci_filename(fname); } static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } #endif /* !CONFIG_FS_ENCRYPTION */ /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, char *, int, unsigned int); #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { /* ext4_iget() should have caught this... */ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; static inline unsigned char get_dtype(struct super_block *sb, int filetype) { if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) return DT_UNKNOWN; return ext4_filetype_table[filetype]; } extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); /* hash.c */ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks); #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ (goal), (owner), i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* fast_commit.c */ int ext4_fc_info_show(struct seq_file *seq, void *v); void ext4_fc_init(struct super_block *sb, journal_t *journal); void ext4_fc_init_inode(struct inode *inode); void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry); void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); void ext4_fc_destroy_dentry_cache(void); int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern void ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt); extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); extern int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state); static inline bool ext4_mb_cr_expensive(enum criteria cr) { return cr >= CR_GOAL_LEN_SLOW; } /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); void ext4_set_inode_mapping_order(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line); #define ext4_iget(sb, ino, flags) \ __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_truncate_page_cache_block_range(struct inode *inode, loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); static inline bool is_special_ino(struct super_block *sb, unsigned long ino) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum); } /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); int ext4_force_shutdown(struct super_block *sb, u32 flags); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); extern int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size); extern bool ext4_empty_dir(struct inode *inode); /* resize.c */ extern void ext4_kvfree_array_rcu(void *to_free); extern int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input); extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); extern unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, unsigned int *five, unsigned int *seven); /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group); extern void print_daily_error_info(struct timer_list *t); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, int, __u64, const char *, ...); extern __printf(6, 7) void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, int, const char *, ...); extern __printf(5, 6) void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); extern __printf(4, 5) void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, u64, ext4_fsblk_t, const char *, ...); #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) #define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) #define ext4_error_inode_block(inode, block, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ (fmt), ## a) #define EXT4_ERROR_FILE(file, block, fmt, a...) \ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) #define ext4_abort(sb, err, fmt, a...) \ __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ __ext4_error_inode((inode), (func), (line), (block), \ (err), (fmt), ##__VA_ARGS__) #define ext4_error_file(file, func, line, block, fmt, ...) \ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) #define ext4_error(sb, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ ##__VA_ARGS__) #define ext4_error_err(sb, err, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning_inode(inode, fmt, ...) \ __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) \ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ fmt, ##__VA_ARGS__) #else #define ext4_error_inode(inode, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, 0, " "); \ } while (0) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, err, " "); \ } while (0) #define ext4_error_file(file, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_file(file, "", 0, block, " "); \ } while (0) #define ext4_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, 0, 0, " "); \ } while (0) #define ext4_error_err(sb, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, err, 0, " "); \ } while (0) #define ext4_warning(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) #define ext4_warning_inode(inode, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning_inode(inode, "", 0, " "); \ } while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_msg(sb, "", " "); \ } while (0) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, "", 0, "") #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ } while (0) #endif extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg); extern void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); static inline int ext4_has_group_desc_csum(struct super_block *sb) { return ext4_has_feature_gdt_csum(sb) || ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ ? (ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ le32_to_cpu(es->name##_lo)) static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_blocks_count); } static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_r_blocks_count); } static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_free_blocks_count); } static inline void ext4_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_blocks_count_lo = cpu_to_le32((u32)blk); es->s_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline loff_t ext4_isize(struct super_block *sb, struct ext4_inode *raw_inode) { if (ext4_has_feature_largedir(sb) || S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) { raw_inode->i_size_lo = cpu_to_le32(i_size); raw_inode->i_size_high = cpu_to_le32(i_size >> 32); } /* * Reading s_groups_count requires using smp_rmb() afterwards. See * the locking protocol documented in the comments of ext4_group_add() * in resize.c */ static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) { ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; smp_rmb(); return ngroups; } static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) { return block_group >> sbi->s_log_groups_per_flex; } static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) { return 1 << sbi->s_log_groups_per_flex; } static inline loff_t ext4_get_maxbytes(struct inode *inode) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return inode->i_sb->s_maxbytes; return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; } #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ __ext4_std_error((sb), __func__, __LINE__, (errno)); \ } while (0) #ifdef CONFIG_SMP /* Each CPU can accumulate percpu_counter_batch clusters in their local * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ #define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else #define EXT4_FREECLUSTERS_WATERMARK 0 #endif /* Update i_disksize. Requires i_rwsem to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); up_write(&EXT4_I(inode)->i_data_sem); } /* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) { int changed = 0; if (newsize > inode->i_size) { i_size_write(inode, newsize); changed = 1; } if (newsize > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, newsize); changed |= 2; } return changed; } int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len); struct ext4_group_info { unsigned long bb_state; #ifdef AGGRESSIVE_CHECK unsigned long bb_check_counter; #endif struct rb_root bb_free_root; ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ int bb_avg_fragment_size_order; /* order of average fragment in BG */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means * 5 free 8-block regions. */ }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_SET_TRIMMED(grp) \ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group) { return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } /* * Returns true if the filesystem is busy enough that attempts to * access the block group locks has run into contention. */ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) { return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); } static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) { if (!spin_trylock(ext4_group_lock_ptr(sb, group))) return false; /* * We're able to grab the lock right away, so drop the lock * contention counter. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); return true; } static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { if (!ext4_try_lock_group(sb, group)) { /* * The lock is busy, so bump the contention counter, * and then wait on the spin lock. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION); spin_lock(ext4_group_lock_ptr(sb, group)); } } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { spin_unlock(ext4_group_lock_ptr(sb, group)); } #ifdef CONFIG_QUOTA static inline bool ext4_quota_capable(struct super_block *sb) { return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); } static inline bool ext4_is_quota_journalled(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); } int ext4_enable_quotas(struct super_block *sb); #endif /* * Block validity checking */ #define ext4_check_indirect_blockref(inode, bh) \ ext4_check_blockref(__func__, __LINE__, inode, \ (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_ind_check_inode(inode) \ ext4_check_blockref(__func__, __LINE__, inode, \ EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /* * Inodes and files operations */ /* dir.c */ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_generic_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop, void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); extern int ext4_read_inline_dir(struct file *filp, struct dir_context *ctx, int *has_inline_data); extern int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); extern int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); extern void *ext4_read_inline_link(struct inode *inode); struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); static inline int ext4_has_inline_data(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && EXT4_I(inode)->i_inline_off; } /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, unsigned int parent_ino, void *inline_buf, int inline_size); extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); #define S_SHIFT 12 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, }; static inline void ext4_set_de_type(struct super_block *sb, struct ext4_dir_entry_2 *de, umode_t mode) { if (ext4_has_feature_filetype(sb)) de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } /* readpages.c */ int ext4_read_folio(struct file *file, struct folio *folio); void ext4_readahead(struct readahead_control *rac); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; /* sysfs.c */ extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); extern int ext4_register_sysfs(struct super_block *sb); extern void ext4_unregister_sysfs(struct super_block *sb); extern int __init ext4_init_sysfs(void); extern void ext4_exit_sysfs(void); /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); extern int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); /* extents.c */ struct ext4_ext_path; struct ext4_extent; /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. */ #define EXT_MAX_BLOCKS 0xffffffff extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_map_query_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_map_create_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); extern struct ext4_ext_path *ext4_ext_insert_extent( handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *newext, int gb_flags); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *, int flags); extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred); extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); extern int ext4_ext_replay_set_iblocks(struct inode *inode); extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk); extern int ext4_ext_clear_bb(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, struct inode *second); extern void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); extern int ext4_put_io_end(ext4_io_end_t *io_end); extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); /* mmp.c */ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations ext4_verityops; /* orphan.c */ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es); extern void ext4_release_orphan_info(struct super_block *sb); extern int ext4_init_orphan_info(struct super_block *sb); extern int ext4_orphan_file_empty(struct super_block *sb); extern void ext4_orphan_file_block_trigger( struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size); /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap */ #define BH_BITMAP_UPTODATE BH_JBDPrivateStart static inline int bitmap_uptodate(struct buffer_head *bh) { return (buffer_uptodate(bh) && test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); } static inline void set_bitmap_uptodate(struct buffer_head *bh) { set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; } extern const struct iomap_ops ext4_iomap_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { /* * If the buffer has the write error flag, we have failed * to write out data in the block. In this case, we don't * have to read the block because we may read the old data * successfully. */ if (buffer_write_io_error(bh)) set_buffer_uptodate(bh); return buffer_uptodate(bh); } static inline bool ext4_inode_can_atomic_write(struct inode *inode) { return S_ISREG(inode->i_mode) && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); #endif /* __KERNEL__ */ #endif /* _EXT4_H */
4 1 3 3 1 2 2 1 1 2 14 1 1 2 2 8 8 10 1 1 1 1 3 2 1 1 1 1 1 1 1 2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. */ #include <linux/miscdevice.h> #include <linux/init.h> #include <linux/wait.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/poll.h> #include <linux/signal.h> #include <linux/spinlock.h> #include <linux/dlm.h> #include <linux/dlm_device.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <trace/events/dlm.h> #include "dlm_internal.h" #include "lockspace.h" #include "lock.h" #include "lvb_table.h" #include "user.h" #include "ast.h" #include "config.h" #include "memory.h" static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; static atomic_t dlm_monitor_opened; static int dlm_monitor_unused = 1; #ifdef CONFIG_COMPAT struct dlm_lock_params32 { __u8 mode; __u8 namelen; __u16 unused; __u32 flags; __u32 lkid; __u32 parent; __u64 xid; __u64 timeout; __u32 castparam; __u32 castaddr; __u32 bastparam; __u32 bastaddr; __u32 lksb; char lvb[DLM_USER_LVB_LEN]; char name[]; }; struct dlm_write_request32 { __u32 version[3]; __u8 cmd; __u8 is64bit; __u8 unused[2]; union { struct dlm_lock_params32 lock; struct dlm_lspace_params lspace; struct dlm_purge_params purge; } i; }; struct dlm_lksb32 { __u32 sb_status; __u32 sb_lkid; __u8 sb_flags; __u32 sb_lvbptr; }; struct dlm_lock_result32 { __u32 version[3]; __u32 length; __u32 user_astaddr; __u32 user_astparam; __u32 user_lksb; struct dlm_lksb32 lksb; __u8 bast_mode; __u8 unused[3]; /* Offsets may be zero if no data is present */ __u32 lvb_offset; }; static void compat_input(struct dlm_write_request *kb, struct dlm_write_request32 *kb32, int namelen) { kb->version[0] = kb32->version[0]; kb->version[1] = kb32->version[1]; kb->version[2] = kb32->version[2]; kb->cmd = kb32->cmd; kb->is64bit = kb32->is64bit; if (kb->cmd == DLM_USER_CREATE_LOCKSPACE || kb->cmd == DLM_USER_REMOVE_LOCKSPACE) { kb->i.lspace.flags = kb32->i.lspace.flags; kb->i.lspace.minor = kb32->i.lspace.minor; memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen); } else if (kb->cmd == DLM_USER_PURGE) { kb->i.purge.nodeid = kb32->i.purge.nodeid; kb->i.purge.pid = kb32->i.purge.pid; } else { kb->i.lock.mode = kb32->i.lock.mode; kb->i.lock.namelen = kb32->i.lock.namelen; kb->i.lock.flags = kb32->i.lock.flags; kb->i.lock.lkid = kb32->i.lock.lkid; kb->i.lock.parent = kb32->i.lock.parent; kb->i.lock.xid = kb32->i.lock.xid; kb->i.lock.timeout = kb32->i.lock.timeout; kb->i.lock.castparam = (__user void *)(long)kb32->i.lock.castparam; kb->i.lock.castaddr = (__user void *)(long)kb32->i.lock.castaddr; kb->i.lock.bastparam = (__user void *)(long)kb32->i.lock.bastparam; kb->i.lock.bastaddr = (__user void *)(long)kb32->i.lock.bastaddr; kb->i.lock.lksb = (__user void *)(long)kb32->i.lock.lksb; memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN); memcpy(kb->i.lock.name, kb32->i.lock.name, namelen); } } static void compat_output(struct dlm_lock_result *res, struct dlm_lock_result32 *res32) { memset(res32, 0, sizeof(*res32)); res32->version[0] = res->version[0]; res32->version[1] = res->version[1]; res32->version[2] = res->version[2]; res32->user_astaddr = (__u32)(__force long)res->user_astaddr; res32->user_astparam = (__u32)(__force long)res->user_astparam; res32->user_lksb = (__u32)(__force long)res->user_lksb; res32->bast_mode = res->bast_mode; res32->lvb_offset = res->lvb_offset; res32->length = res->length; res32->lksb.sb_status = res->lksb.sb_status; res32->lksb.sb_flags = res->lksb.sb_flags; res32->lksb.sb_lkid = res->lksb.sb_lkid; res32->lksb.sb_lvbptr = (__u32)(long)res->lksb.sb_lvbptr; } #endif /* Figure out if this lock is at the end of its life and no longer available for the application to use. The lkb still exists until the final ast is read. A lock becomes EOL in three situations: 1. a noqueue request fails with EAGAIN 2. an unlock completes with EUNLOCK 3. a cancel of a waiting request completes with ECANCEL/EDEADLK An EOL lock needs to be removed from the process's list of locks. And we can't allow any new operation on an EOL lock. This is not related to the lifetime of the lkb struct which is managed entirely by refcount. */ static int lkb_is_endoflife(int mode, int status) { switch (status) { case -DLM_EUNLOCK: return 1; case -DLM_ECANCEL: case -ETIMEDOUT: case -EDEADLK: case -EAGAIN: if (mode == DLM_LOCK_IV) return 1; break; } return 0; } /* we could possibly check if the cancel of an orphan has resulted in the lkb being removed and then remove that lkb from the orphans list and free it */ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status, uint32_t sbflags) { struct dlm_ls *ls; struct dlm_user_args *ua; struct dlm_user_proc *proc; struct dlm_callback *cb; int rv, copy_lvb; if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) || test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags)) return; ls = lkb->lkb_resource->res_ls; spin_lock_bh(&ls->ls_clear_proc_locks); /* If ORPHAN/DEAD flag is set, it means the process is dead so an ast can't be delivered. For ORPHAN's, dlm_clear_proc_locks() freed lkb->ua so we can't try to use it. This second check is necessary for cases where a completion ast is received for an operation that began before clear_proc_locks did its cancel/unlock. */ if (test_bit(DLM_DFL_ORPHAN_BIT, &lkb->lkb_dflags) || test_bit(DLM_IFL_DEAD_BIT, &lkb->lkb_iflags)) goto out; DLM_ASSERT(lkb->lkb_ua, dlm_print_lkb(lkb);); ua = lkb->lkb_ua; proc = ua->proc; if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL) goto out; if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status)) set_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags); spin_lock_bh(&proc->asts_spin); if (!dlm_may_skip_callback(lkb, flags, mode, status, sbflags, &copy_lvb)) { rv = dlm_get_cb(lkb, flags, mode, status, sbflags, &cb); if (!rv) { cb->copy_lvb = copy_lvb; cb->ua = *ua; cb->lkb_lksb = &cb->ua.lksb; if (copy_lvb) { memcpy(cb->lvbptr, ua->lksb.sb_lvbptr, DLM_USER_LVB_LEN); cb->lkb_lksb->sb_lvbptr = cb->lvbptr; } list_add_tail(&cb->list, &proc->asts); wake_up_interruptible(&proc->wait); } } spin_unlock_bh(&proc->asts_spin); if (test_bit(DLM_IFL_ENDOFLIFE_BIT, &lkb->lkb_iflags)) { /* N.B. spin_lock locks_spin, not asts_spin */ spin_lock_bh(&proc->locks_spin); if (!list_empty(&lkb->lkb_ownqueue)) { list_del_init(&lkb->lkb_ownqueue); dlm_put_lkb(lkb); } spin_unlock_bh(&proc->locks_spin); } out: spin_unlock_bh(&ls->ls_clear_proc_locks); } static int device_user_lock(struct dlm_user_proc *proc, struct dlm_lock_params *params) { struct dlm_ls *ls; struct dlm_user_args *ua; uint32_t lkid; int error = -ENOMEM; ls = dlm_find_lockspace_local(proc->lockspace); if (!ls) return -ENOENT; if (!params->castaddr || !params->lksb) { error = -EINVAL; goto out; } ua = kzalloc_obj(struct dlm_user_args, GFP_NOFS); if (!ua) goto out; ua->proc = proc; ua->user_lksb = params->lksb; ua->castparam = params->castparam; ua->castaddr = params->castaddr; ua->bastparam = params->bastparam; ua->bastaddr = params->bastaddr; ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) { error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb); } else if (params->flags & DLM_LKF_ORPHAN) { error = dlm_user_adopt_orphan(ls, ua, params->mode, params->flags, params->name, params->namelen, &lkid); if (!error) error = lkid; } else { error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen); if (!error) error = ua->lksb.sb_lkid; } out: dlm_put_lockspace(ls); return error; } static int device_user_unlock(struct dlm_user_proc *proc, struct dlm_lock_params *params) { struct dlm_ls *ls; struct dlm_user_args *ua; int error = -ENOMEM; ls = dlm_find_lockspace_local(proc->lockspace); if (!ls) return -ENOENT; ua = kzalloc_obj(struct dlm_user_args, GFP_NOFS); if (!ua) goto out; ua->proc = proc; ua->user_lksb = params->lksb; ua->castparam = params->castparam; ua->castaddr = params->castaddr; if (params->flags & DLM_LKF_CANCEL) error = dlm_user_cancel(ls, ua, params->flags, params->lkid); else error = dlm_user_unlock(ls, ua, params->flags, params->lkid, params->lvb); out: dlm_put_lockspace(ls); return error; } static int device_user_deadlock(struct dlm_user_proc *proc, struct dlm_lock_params *params) { struct dlm_ls *ls; int error; ls = dlm_find_lockspace_local(proc->lockspace); if (!ls) return -ENOENT; error = dlm_user_deadlock(ls, params->flags, params->lkid); dlm_put_lockspace(ls); return error; } static int dlm_device_register(struct dlm_ls *ls, char *name) { int error, len; /* The device is already registered. This happens when the lockspace is created multiple times from userspace. */ if (ls->ls_device.name) return 0; error = -ENOMEM; len = strlen(name) + strlen(name_prefix) + 2; ls->ls_device.name = kzalloc(len, GFP_NOFS); if (!ls->ls_device.name) goto fail; snprintf((char *)ls->ls_device.name, len, "%s_%s", name_prefix, name); ls->ls_device.fops = &device_fops; ls->ls_device.minor = MISC_DYNAMIC_MINOR; error = misc_register(&ls->ls_device); if (error) { kfree(ls->ls_device.name); /* this has to be set to NULL * to avoid a double-free in dlm_device_deregister */ ls->ls_device.name = NULL; } fail: return error; } int dlm_device_deregister(struct dlm_ls *ls) { /* The device is not registered. This happens when the lockspace was never used from userspace, or when device_create_lockspace() calls dlm_release_lockspace() after the register fails. */ if (!ls->ls_device.name) return 0; misc_deregister(&ls->ls_device); kfree(ls->ls_device.name); return 0; } static int device_user_purge(struct dlm_user_proc *proc, struct dlm_purge_params *params) { struct dlm_ls *ls; int error; ls = dlm_find_lockspace_local(proc->lockspace); if (!ls) return -ENOENT; error = dlm_user_purge(ls, proc, params->nodeid, params->pid); dlm_put_lockspace(ls); return error; } static int device_create_lockspace(struct dlm_lspace_params *params) { dlm_lockspace_t *lockspace; struct dlm_ls *ls; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name, params->flags, DLM_USER_LVB_LEN, NULL, NULL, NULL, &lockspace); if (error) return error; ls = dlm_find_lockspace_local(lockspace); if (!ls) return -ENOENT; error = dlm_device_register(ls, params->name); dlm_put_lockspace(ls); if (error) dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS); else error = ls->ls_device.minor; return error; } static int device_remove_lockspace(struct dlm_lspace_params *params) { dlm_lockspace_t *lockspace; struct dlm_ls *ls; int error, force = DLM_RELEASE_NO_LOCKS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; ls = dlm_find_lockspace_device(params->minor); if (!ls) return -ENOENT; if (params->flags & DLM_USER_LSFLG_FORCEFREE) force = DLM_RELEASE_NORMAL; lockspace = ls; dlm_put_lockspace(ls); /* The final dlm_release_lockspace waits for references to go to zero, so all processes will need to close their device for the ls before the release will proceed. release also calls the device_deregister above. Converting a positive return value from release to zero means that userspace won't know when its release was the final one, but it shouldn't need to know. */ error = dlm_release_lockspace(lockspace, force); if (error > 0) error = 0; return error; } /* Check the user's version matches ours */ static int check_version(struct dlm_write_request *req) { if (req->version[0] != DLM_DEVICE_VERSION_MAJOR || (req->version[0] == DLM_DEVICE_VERSION_MAJOR && req->version[1] > DLM_DEVICE_VERSION_MINOR)) { printk(KERN_DEBUG "dlm: process %s (%d) version mismatch " "user (%d.%d.%d) kernel (%d.%d.%d)\n", current->comm, task_pid_nr(current), req->version[0], req->version[1], req->version[2], DLM_DEVICE_VERSION_MAJOR, DLM_DEVICE_VERSION_MINOR, DLM_DEVICE_VERSION_PATCH); return -EINVAL; } return 0; } /* * device_write * * device_user_lock * dlm_user_request -> request_lock * dlm_user_convert -> convert_lock * * device_user_unlock * dlm_user_unlock -> unlock_lock * dlm_user_cancel -> cancel_lock * * device_create_lockspace * dlm_new_lockspace * * device_remove_lockspace * dlm_release_lockspace */ /* a write to a lockspace device is a lock or unlock request, a write to the control device is to create/remove a lockspace */ static ssize_t device_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct dlm_user_proc *proc = file->private_data; struct dlm_write_request *kbuf; int error; #ifdef CONFIG_COMPAT if (count < sizeof(struct dlm_write_request32)) #else if (count < sizeof(struct dlm_write_request)) #endif return -EINVAL; /* * can't compare against COMPAT/dlm_write_request32 because * we don't yet know if is64bit is zero */ if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) return -EINVAL; kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); if (check_version(kbuf)) { error = -EBADE; goto out_free; } #ifdef CONFIG_COMPAT if (!kbuf->is64bit) { struct dlm_write_request32 *k32buf; int namelen = 0; if (count > sizeof(struct dlm_write_request32)) namelen = count - sizeof(struct dlm_write_request32); k32buf = (struct dlm_write_request32 *)kbuf; /* add 1 after namelen so that the name string is terminated */ kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, GFP_NOFS); if (!kbuf) { kfree(k32buf); return -ENOMEM; } if (proc) set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags); compat_input(kbuf, k32buf, namelen); kfree(k32buf); } #endif /* do we really need this? can a write happen after a close? */ if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) && (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) { error = -EINVAL; goto out_free; } error = -EINVAL; switch (kbuf->cmd) { case DLM_USER_LOCK: if (!proc) { log_print("no locking on control device"); goto out_free; } error = device_user_lock(proc, &kbuf->i.lock); break; case DLM_USER_UNLOCK: if (!proc) { log_print("no locking on control device"); goto out_free; } error = device_user_unlock(proc, &kbuf->i.lock); break; case DLM_USER_DEADLOCK: if (!proc) { log_print("no locking on control device"); goto out_free; } error = device_user_deadlock(proc, &kbuf->i.lock); break; case DLM_USER_CREATE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); goto out_free; } error = device_create_lockspace(&kbuf->i.lspace); break; case DLM_USER_REMOVE_LOCKSPACE: if (proc) { log_print("create/remove only on control device"); goto out_free; } error = device_remove_lockspace(&kbuf->i.lspace); break; case DLM_USER_PURGE: if (!proc) { log_print("no locking on control device"); goto out_free; } error = device_user_purge(proc, &kbuf->i.purge); break; default: log_print("Unknown command passed to DLM device : %d\n", kbuf->cmd); } out_free: kfree(kbuf); return error; } /* Every process that opens the lockspace device has its own "proc" structure hanging off the open file that's used to keep track of locks owned by the process and asts that need to be delivered to the process. */ static int device_open(struct inode *inode, struct file *file) { struct dlm_user_proc *proc; struct dlm_ls *ls; ls = dlm_find_lockspace_device(iminor(inode)); if (!ls) return -ENOENT; proc = kzalloc_obj(struct dlm_user_proc, GFP_NOFS); if (!proc) { dlm_put_lockspace(ls); return -ENOMEM; } proc->lockspace = ls; INIT_LIST_HEAD(&proc->asts); INIT_LIST_HEAD(&proc->locks); INIT_LIST_HEAD(&proc->unlocking); spin_lock_init(&proc->asts_spin); spin_lock_init(&proc->locks_spin); init_waitqueue_head(&proc->wait); file->private_data = proc; return 0; } static int device_close(struct inode *inode, struct file *file) { struct dlm_user_proc *proc = file->private_data; struct dlm_ls *ls; ls = dlm_find_lockspace_local(proc->lockspace); if (!ls) return -ENOENT; set_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags); dlm_clear_proc_locks(ls, proc); /* at this point no more lkb's should exist for this lockspace, so there's no chance of dlm_user_add_ast() being called and looking for lkb->ua->proc */ kfree(proc); file->private_data = NULL; dlm_put_lockspace(ls); dlm_put_lockspace(ls); /* for the find in device_open() */ /* FIXME: AUTOFREE: if this ls is no longer used do device_remove_lockspace() */ return 0; } static int copy_result_to_user(struct dlm_user_args *ua, int compat, uint32_t flags, int mode, int copy_lvb, char __user *buf, size_t count) { #ifdef CONFIG_COMPAT struct dlm_lock_result32 result32; #endif struct dlm_lock_result result; void *resultptr; int error=0; int len; int struct_len; memset(&result, 0, sizeof(struct dlm_lock_result)); result.version[0] = DLM_DEVICE_VERSION_MAJOR; result.version[1] = DLM_DEVICE_VERSION_MINOR; result.version[2] = DLM_DEVICE_VERSION_PATCH; memcpy(&result.lksb, &ua->lksb, offsetof(struct dlm_lksb, sb_lvbptr)); result.user_lksb = ua->user_lksb; /* FIXME: dlm1 provides for the user's bastparam/addr to not be updated in a conversion unless the conversion is successful. See code in dlm_user_convert() for updating ua from ua_tmp. OpenVMS, though, notes that a new blocking AST address and parameter are set even if the conversion fails, so maybe we should just do that. */ if (flags & DLM_CB_BAST) { result.user_astaddr = ua->bastaddr; result.user_astparam = ua->bastparam; result.bast_mode = mode; } else { result.user_astaddr = ua->castaddr; result.user_astparam = ua->castparam; } #ifdef CONFIG_COMPAT if (compat) len = sizeof(struct dlm_lock_result32); else #endif len = sizeof(struct dlm_lock_result); struct_len = len; /* copy lvb to userspace if there is one, it's been updated, and the user buffer has space for it */ if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) { if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, DLM_USER_LVB_LEN)) { error = -EFAULT; goto out; } result.lvb_offset = len; len += DLM_USER_LVB_LEN; } result.length = len; resultptr = &result; #ifdef CONFIG_COMPAT if (compat) { compat_output(&result, &result32); resultptr = &result32; } #endif if (copy_to_user(buf, resultptr, struct_len)) error = -EFAULT; else error = len; out: return error; } static int copy_version_to_user(char __user *buf, size_t count) { struct dlm_device_version ver; memset(&ver, 0, sizeof(struct dlm_device_version)); ver.version[0] = DLM_DEVICE_VERSION_MAJOR; ver.version[1] = DLM_DEVICE_VERSION_MINOR; ver.version[2] = DLM_DEVICE_VERSION_PATCH; if (copy_to_user(buf, &ver, sizeof(struct dlm_device_version))) return -EFAULT; return sizeof(struct dlm_device_version); } /* a read returns a single ast described in a struct dlm_lock_result */ static ssize_t device_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct dlm_user_proc *proc = file->private_data; DECLARE_WAITQUEUE(wait, current); struct dlm_callback *cb; int rv, ret; if (count == sizeof(struct dlm_device_version)) { rv = copy_version_to_user(buf, count); return rv; } if (!proc) { log_print("non-version read from control device %zu", count); return -EINVAL; } #ifdef CONFIG_COMPAT if (count < sizeof(struct dlm_lock_result32)) #else if (count < sizeof(struct dlm_lock_result)) #endif return -EINVAL; /* do we really need this? can a read happen after a close? */ if (test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags)) return -EINVAL; spin_lock_bh(&proc->asts_spin); if (list_empty(&proc->asts)) { if (file->f_flags & O_NONBLOCK) { spin_unlock_bh(&proc->asts_spin); return -EAGAIN; } add_wait_queue(&proc->wait, &wait); repeat: set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&proc->asts) && !signal_pending(current)) { spin_unlock_bh(&proc->asts_spin); schedule(); spin_lock_bh(&proc->asts_spin); goto repeat; } set_current_state(TASK_RUNNING); remove_wait_queue(&proc->wait, &wait); if (signal_pending(current)) { spin_unlock_bh(&proc->asts_spin); return -ERESTARTSYS; } } /* if we empty lkb_callbacks, we don't want to unlock the spinlock without removing lkb_cb_list; so empty lkb_cb_list is always consistent with empty lkb_callbacks */ cb = list_first_entry(&proc->asts, struct dlm_callback, list); list_del(&cb->list); spin_unlock_bh(&proc->asts_spin); if (cb->flags & DLM_CB_BAST) { trace_dlm_bast(cb->ls_id, cb->lkb_id, cb->mode, cb->res_name, cb->res_length); } else if (cb->flags & DLM_CB_CAST) { cb->lkb_lksb->sb_status = cb->sb_status; cb->lkb_lksb->sb_flags = cb->sb_flags; trace_dlm_ast(cb->ls_id, cb->lkb_id, cb->sb_status, cb->sb_flags, cb->res_name, cb->res_length); } ret = copy_result_to_user(&cb->ua, test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), cb->flags, cb->mode, cb->copy_lvb, buf, count); dlm_free_cb(cb); return ret; } static __poll_t device_poll(struct file *file, poll_table *wait) { struct dlm_user_proc *proc = file->private_data; poll_wait(file, &proc->wait, wait); spin_lock_bh(&proc->asts_spin); if (!list_empty(&proc->asts)) { spin_unlock_bh(&proc->asts_spin); return EPOLLIN | EPOLLRDNORM; } spin_unlock_bh(&proc->asts_spin); return 0; } int dlm_user_daemon_available(void) { /* dlm_controld hasn't started (or, has started, but not properly populated configfs) */ if (!dlm_our_nodeid()) return 0; /* This is to deal with versions of dlm_controld that don't know about the monitor device. We assume that if the dlm_controld was started (above), but the monitor device was never opened, that it's an old version. dlm_controld should open the monitor device before populating configfs. */ if (dlm_monitor_unused) return 1; return atomic_read(&dlm_monitor_opened) ? 1 : 0; } static int ctl_device_open(struct inode *inode, struct file *file) { file->private_data = NULL; return 0; } static int ctl_device_close(struct inode *inode, struct file *file) { return 0; } static int monitor_device_open(struct inode *inode, struct file *file) { atomic_inc(&dlm_monitor_opened); dlm_monitor_unused = 0; return 0; } static int monitor_device_close(struct inode *inode, struct file *file) { if (atomic_dec_and_test(&dlm_monitor_opened)) dlm_stop_lockspaces(); return 0; } static const struct file_operations device_fops = { .open = device_open, .release = device_close, .read = device_read, .write = device_write, .poll = device_poll, .owner = THIS_MODULE, .llseek = noop_llseek, }; static const struct file_operations ctl_device_fops = { .open = ctl_device_open, .release = ctl_device_close, .read = device_read, .write = device_write, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice ctl_device = { .name = "dlm-control", .fops = &ctl_device_fops, .minor = MISC_DYNAMIC_MINOR, }; static const struct file_operations monitor_device_fops = { .open = monitor_device_open, .release = monitor_device_close, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice monitor_device = { .name = "dlm-monitor", .fops = &monitor_device_fops, .minor = MISC_DYNAMIC_MINOR, }; int __init dlm_user_init(void) { int error; atomic_set(&dlm_monitor_opened, 0); error = misc_register(&ctl_device); if (error) { log_print("misc_register failed for control device"); goto out; } error = misc_register(&monitor_device); if (error) { log_print("misc_register failed for monitor device"); misc_deregister(&ctl_device); } out: return error; } void dlm_user_exit(void) { misc_deregister(&ctl_device); misc_deregister(&monitor_device); }
8 8 14 14 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 // SPDX-License-Identifier: GPL-2.0-or-later /* In-software asymmetric public-key crypto subtype * * See Documentation/crypto/asymmetric-keys.rst * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) "PKEY: "fmt #include <crypto/akcipher.h> #include <crypto/public_key.h> #include <crypto/sig.h> #include <keys/asymmetric-subtype.h> #include <linux/asn1.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> MODULE_DESCRIPTION("In-software asymmetric public-key subtype"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); /* * Provide a part of a description of the key for /proc/keys. */ static void public_key_describe(const struct key *asymmetric_key, struct seq_file *m) { struct public_key *key = asymmetric_key->payload.data[asym_crypto]; if (key) seq_printf(m, "%s.%s", key->id_type, key->pkey_algo); } /* * Destroy a public key algorithm key. */ void public_key_free(struct public_key *key) { if (key) { kfree_sensitive(key->key); kfree(key->params); kfree(key); } } EXPORT_SYMBOL_GPL(public_key_free); /* * Destroy a public key algorithm key. */ static void public_key_destroy(void *payload0, void *payload3) { public_key_free(payload0); public_key_signature_free(payload3); } /* * Given a public_key, and an encoding and hash_algo to be used for signing * and/or verification with that key, determine the name of the corresponding * akcipher algorithm. Also check that encoding and hash_algo are allowed. */ static int software_key_determine_akcipher(const struct public_key *pkey, const char *encoding, const char *hash_algo, char alg_name[CRYPTO_MAX_ALG_NAME], bool *sig, enum kernel_pkey_operation op) { int n; *sig = true; if (!encoding) return -EINVAL; if (strcmp(pkey->pkey_algo, "rsa") == 0) { /* * RSA signatures usually use EMSA-PKCS1-1_5 [RFC3447 sec 8.2]. */ if (strcmp(encoding, "pkcs1") == 0) { *sig = op == kernel_pkey_sign || op == kernel_pkey_verify; if (!*sig) { /* * For encrypt/decrypt, hash_algo is not used * but allowed to be set for historic reasons. */ n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "pkcs1pad(%s)", pkey->pkey_algo); } else { if (!hash_algo) hash_algo = "none"; n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "pkcs1(%s,%s)", pkey->pkey_algo, hash_algo); } return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0; } if (strcmp(encoding, "raw") != 0) return -EINVAL; /* * Raw RSA cannot differentiate between different hash * algorithms. */ if (hash_algo) return -EINVAL; *sig = false; } else if (strncmp(pkey->pkey_algo, "ecdsa", 5) == 0) { if (strcmp(encoding, "x962") != 0 && strcmp(encoding, "p1363") != 0) return -EINVAL; /* * ECDSA signatures are taken over a raw hash, so they don't * differentiate between different hash algorithms. That means * that the verifier should hard-code a specific hash algorithm. * Unfortunately, in practice ECDSA is used with multiple SHAs, * so we have to allow all of them and not just one. */ if (!hash_algo) return -EINVAL; if (strcmp(hash_algo, "sha1") != 0 && strcmp(hash_algo, "sha224") != 0 && strcmp(hash_algo, "sha256") != 0 && strcmp(hash_algo, "sha384") != 0 && strcmp(hash_algo, "sha512") != 0 && strcmp(hash_algo, "sha3-256") != 0 && strcmp(hash_algo, "sha3-384") != 0 && strcmp(hash_algo, "sha3-512") != 0) return -EINVAL; n = snprintf(alg_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", encoding, pkey->pkey_algo); return n >= CRYPTO_MAX_ALG_NAME ? -EINVAL : 0; } else if (strcmp(pkey->pkey_algo, "ecrdsa") == 0) { if (strcmp(encoding, "raw") != 0) return -EINVAL; if (!hash_algo) return -EINVAL; if (strcmp(hash_algo, "streebog256") != 0 && strcmp(hash_algo, "streebog512") != 0) return -EINVAL; } else if (strcmp(pkey->pkey_algo, "mldsa44") == 0 || strcmp(pkey->pkey_algo, "mldsa65") == 0 || strcmp(pkey->pkey_algo, "mldsa87") == 0) { if (strcmp(encoding, "raw") != 0) return -EINVAL; if (!hash_algo) return -EINVAL; if (strcmp(hash_algo, "none") != 0 && strcmp(hash_algo, "sha512") != 0) return -EINVAL; } else { /* Unknown public key algorithm */ return -ENOPKG; } if (strscpy(alg_name, pkey->pkey_algo, CRYPTO_MAX_ALG_NAME) < 0) return -EINVAL; return 0; } static u8 *pkey_pack_u32(u8 *dst, u32 val) { memcpy(dst, &val, sizeof(val)); return dst + sizeof(val); } /* * Query information about a key. */ static int software_key_query(const struct kernel_pkey_params *params, struct kernel_pkey_query *info) { struct public_key *pkey = params->key->payload.data[asym_crypto]; char alg_name[CRYPTO_MAX_ALG_NAME]; u8 *key, *ptr; int ret, len; bool issig; ret = software_key_determine_akcipher(pkey, params->encoding, params->hash_algo, alg_name, &issig, kernel_pkey_sign); if (ret < 0) return ret; key = kmalloc(pkey->keylen + sizeof(u32) * 2 + pkey->paramlen, GFP_KERNEL); if (!key) return -ENOMEM; memcpy(key, pkey->key, pkey->keylen); ptr = key + pkey->keylen; ptr = pkey_pack_u32(ptr, pkey->algo); ptr = pkey_pack_u32(ptr, pkey->paramlen); memcpy(ptr, pkey->params, pkey->paramlen); memset(info, 0, sizeof(*info)); if (issig) { struct crypto_sig *sig; sig = crypto_alloc_sig(alg_name, 0, 0); if (IS_ERR(sig)) { ret = PTR_ERR(sig); goto error_free_key; } if (pkey->key_is_private) ret = crypto_sig_set_privkey(sig, key, pkey->keylen); else ret = crypto_sig_set_pubkey(sig, key, pkey->keylen); if (ret < 0) goto error_free_sig; len = crypto_sig_keysize(sig); info->key_size = len; info->max_sig_size = crypto_sig_maxsize(sig); info->max_data_size = crypto_sig_digestsize(sig); info->supported_ops = KEYCTL_SUPPORTS_VERIFY; if (pkey->key_is_private) info->supported_ops |= KEYCTL_SUPPORTS_SIGN; if (strcmp(params->encoding, "pkcs1") == 0) { info->max_enc_size = len / BITS_PER_BYTE; info->max_dec_size = len / BITS_PER_BYTE; info->supported_ops |= KEYCTL_SUPPORTS_ENCRYPT; if (pkey->key_is_private) info->supported_ops |= KEYCTL_SUPPORTS_DECRYPT; } error_free_sig: crypto_free_sig(sig); } else { struct crypto_akcipher *tfm; tfm = crypto_alloc_akcipher(alg_name, 0, 0); if (IS_ERR(tfm)) { ret = PTR_ERR(tfm); goto error_free_key; } if (pkey->key_is_private) ret = crypto_akcipher_set_priv_key(tfm, key, pkey->keylen); else ret = crypto_akcipher_set_pub_key(tfm, key, pkey->keylen); if (ret < 0) goto error_free_akcipher; len = crypto_akcipher_maxsize(tfm); info->key_size = len * BITS_PER_BYTE; info->max_sig_size = len; info->max_data_size = len; info->max_enc_size = len; info->max_dec_size = len; info->supported_ops = KEYCTL_SUPPORTS_ENCRYPT; if (pkey->key_is_private) info->supported_ops |= KEYCTL_SUPPORTS_DECRYPT; error_free_akcipher: crypto_free_akcipher(tfm); } error_free_key: kfree_sensitive(key); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /* * Do encryption, decryption and signing ops. */ static int software_key_eds_op(struct kernel_pkey_params *params, const void *in, void *out) { const struct public_key *pkey = params->key->payload.data[asym_crypto]; char alg_name[CRYPTO_MAX_ALG_NAME]; struct crypto_akcipher *tfm; struct crypto_sig *sig; char *key, *ptr; bool issig; int ret; pr_devel("==>%s()\n", __func__); ret = software_key_determine_akcipher(pkey, params->encoding, params->hash_algo, alg_name, &issig, params->op); if (ret < 0) return ret; key = kmalloc(pkey->keylen + sizeof(u32) * 2 + pkey->paramlen, GFP_KERNEL); if (!key) return -ENOMEM; memcpy(key, pkey->key, pkey->keylen); ptr = key + pkey->keylen; ptr = pkey_pack_u32(ptr, pkey->algo); ptr = pkey_pack_u32(ptr, pkey->paramlen); memcpy(ptr, pkey->params, pkey->paramlen); if (issig) { sig = crypto_alloc_sig(alg_name, 0, 0); if (IS_ERR(sig)) { ret = PTR_ERR(sig); goto error_free_key; } if (pkey->key_is_private) ret = crypto_sig_set_privkey(sig, key, pkey->keylen); else ret = crypto_sig_set_pubkey(sig, key, pkey->keylen); if (ret) goto error_free_tfm; } else { tfm = crypto_alloc_akcipher(alg_name, 0, 0); if (IS_ERR(tfm)) { ret = PTR_ERR(tfm); goto error_free_key; } if (pkey->key_is_private) ret = crypto_akcipher_set_priv_key(tfm, key, pkey->keylen); else ret = crypto_akcipher_set_pub_key(tfm, key, pkey->keylen); if (ret) goto error_free_tfm; } ret = -EINVAL; /* Perform the encryption calculation. */ switch (params->op) { case kernel_pkey_encrypt: if (issig) break; ret = crypto_akcipher_sync_encrypt(tfm, in, params->in_len, out, params->out_len); break; case kernel_pkey_decrypt: if (issig) break; ret = crypto_akcipher_sync_decrypt(tfm, in, params->in_len, out, params->out_len); break; case kernel_pkey_sign: if (!issig) break; ret = crypto_sig_sign(sig, in, params->in_len, out, params->out_len); break; default: BUG(); } if (!issig && ret == 0) ret = crypto_akcipher_maxsize(tfm); error_free_tfm: if (issig) crypto_free_sig(sig); else crypto_free_akcipher(tfm); error_free_key: kfree_sensitive(key); pr_devel("<==%s() = %d\n", __func__, ret); return ret; } /* * Verify a signature using a public key. */ int public_key_verify_signature(const struct public_key *pkey, const struct public_key_signature *sig) { char alg_name[CRYPTO_MAX_ALG_NAME]; struct crypto_sig *tfm; char *key, *ptr; bool issig; int ret; pr_devel("==>%s()\n", __func__); BUG_ON(!pkey); BUG_ON(!sig); BUG_ON(!sig->s); /* * If the signature specifies a public key algorithm, it *must* match * the key's actual public key algorithm. * * Small exception: ECDSA signatures don't specify the curve, but ECDSA * keys do. So the strings can mismatch slightly in that case: * "ecdsa-nist-*" for the key, but "ecdsa" for the signature. */ if (sig->pkey_algo) { if (strcmp(pkey->pkey_algo, sig->pkey_algo) != 0 && (strncmp(pkey->pkey_algo, "ecdsa-", 6) != 0 || strcmp(sig->pkey_algo, "ecdsa") != 0)) return -EKEYREJECTED; } ret = software_key_determine_akcipher(pkey, sig->encoding, sig->hash_algo, alg_name, &issig, kernel_pkey_verify); if (ret < 0) return ret; tfm = crypto_alloc_sig(alg_name, 0, 0); if (IS_ERR(tfm)) return PTR_ERR(tfm); key = kmalloc(pkey->keylen + sizeof(u32) * 2 + pkey->paramlen, GFP_KERNEL); if (!key) { ret = -ENOMEM; goto error_free_tfm; } memcpy(key, pkey->key, pkey->keylen); ptr = key + pkey->keylen; ptr = pkey_pack_u32(ptr, pkey->algo); ptr = pkey_pack_u32(ptr, pkey->paramlen); memcpy(ptr, pkey->params, pkey->paramlen); if (pkey->key_is_private) ret = crypto_sig_set_privkey(tfm, key, pkey->keylen); else ret = crypto_sig_set_pubkey(tfm, key, pkey->keylen); if (ret) goto error_free_key; ret = crypto_sig_verify(tfm, sig->s, sig->s_size, sig->m, sig->m_size); error_free_key: kfree_sensitive(key); error_free_tfm: crypto_free_sig(tfm); pr_devel("<==%s() = %d\n", __func__, ret); if (WARN_ON_ONCE(ret > 0)) ret = -EINVAL; return ret; } EXPORT_SYMBOL_GPL(public_key_verify_signature); static int public_key_verify_signature_2(const struct key *key, const struct public_key_signature *sig) { const struct public_key *pk = key->payload.data[asym_crypto]; return public_key_verify_signature(pk, sig); } /* * Public key algorithm asymmetric key subtype */ struct asymmetric_key_subtype public_key_subtype = { .owner = THIS_MODULE, .name = "public_key", .name_len = sizeof("public_key") - 1, .describe = public_key_describe, .destroy = public_key_destroy, .query = software_key_query, .eds_op = software_key_eds_op, .verify_signature = public_key_verify_signature_2, }; EXPORT_SYMBOL_GPL(public_key_subtype);
2464 2474 2046 89 1967 1000 2055 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 // SPDX-License-Identifier: GPL-2.0 /* * linux/lib/kasprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/stdarg.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> /* Simplified asprintf. */ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { unsigned int first, second; char *p; va_list aq; va_copy(aq, ap); first = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = kmalloc_track_caller(first+1, gfp); if (!p) return NULL; second = vsnprintf(p, first+1, fmt, ap); WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)", first, second, fmt); return p; } EXPORT_SYMBOL(kvasprintf); /* * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt * (or the sole vararg) points to rodata, we will then save a memory * allocation and string copy. In any case, the return value should be * freed using kfree_const(). */ const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap) { if (!strchr(fmt, '%')) return kstrdup_const(fmt, gfp); if (!strcmp(fmt, "%s")) return kstrdup_const(va_arg(ap, const char*), gfp); return kvasprintf(gfp, fmt, ap); } EXPORT_SYMBOL(kvasprintf_const); char *kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL(kasprintf);
13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_simple.c Simple example of an action * * Authors: Jamal Hadi Salim (2005-8) */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 TC_INDIRECT_SCOPE int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_defact *d = to_defact(a); spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); bstats_update(&d->tcf_bstats, skb); /* print policy string followed by _ then packet count * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, u64_stats_read(&d->tcf_bstats.packets)); spin_unlock(&d->tcf_lock); return d->tcf_action; } static void tcf_simp_release(struct tc_action *a) { struct tcf_defact *d = to_defact(a); kfree(d->tcfd_defdata); } static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) { d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } static int reset_policy(struct tc_action *a, const struct nlattr *defdata, struct tc_defact *p, struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tcf_chain *goto_ch = NULL; struct tcf_defact *d; int err; err = tcf_action_check_ctrlact(p->action, tp, &goto_ch, extack); if (err < 0) return err; d = to_defact(a); spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return 0; } static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, }; static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_DEF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_defact *parm; struct tcf_defact *d; bool exists = false; int ret = 0, err; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_DEF_MAX, nla, simple_policy, NULL); if (err < 0) return err; if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_simp_ops, bind, false, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } d = to_defact(*a); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; err = alloc_defdata(d, tb[TCA_DEF_DATA]); if (err < 0) goto put_chain; tcf_action_set_ctrlact(*a, parm->action, goto_ch); ret = ACT_P_CREATED; } else { if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } err = reset_policy(*a, tb[TCA_DEF_DATA], parm, tp, extack); if (err) goto release_idr; } return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } static struct tc_action_ops act_simp_ops = { .kind = "simple", .id = TCA_ID_SIMP, .owner = THIS_MODULE, .act = tcf_simp_act, .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, .size = sizeof(struct tcf_defact), }; MODULE_ALIAS_NET_ACT("simple"); static __net_init int simp_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); return tc_action_net_init(net, tn, &act_simp_ops); } static void __net_exit simp_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_simp_ops.net_id); } static struct pernet_operations simp_net_ops = { .init = simp_init_net, .exit_batch = simp_exit_net, .id = &act_simp_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); MODULE_DESCRIPTION("Simple example action"); MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { int ret = tcf_register_action(&act_simp_ops, &simp_net_ops); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; } static void __exit simp_cleanup_module(void) { tcf_unregister_action(&act_simp_ops, &simp_net_ops); } module_init(simp_init_module); module_exit(simp_cleanup_module);
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 // SPDX-License-Identifier: GPL-2.0-only /* * Handshake request lifetime events * * Author: Chuck Lever <chuck.lever@oracle.com> * * Copyright (c) 2023, Oracle and/or its affiliates. */ #include <linux/types.h> #include <linux/socket.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet.h> #include <linux/rhashtable.h> #include <net/sock.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include <kunit/visibility.h> #include <uapi/linux/handshake.h> #include "handshake.h" #include <trace/events/handshake.h> /* * We need both a handshake_req -> sock mapping, and a sock -> * handshake_req mapping. Both are one-to-one. * * To avoid adding another pointer field to struct sock, net/handshake * maintains a hash table, indexed by the memory address of @sock, to * find the struct handshake_req outstanding for that socket. The * reverse direction uses a simple pointer field in the handshake_req * struct. */ static struct rhashtable handshake_rhashtbl ____cacheline_aligned_in_smp; static const struct rhashtable_params handshake_rhash_params = { .key_len = sizeof_field(struct handshake_req, hr_sk), .key_offset = offsetof(struct handshake_req, hr_sk), .head_offset = offsetof(struct handshake_req, hr_rhash), .automatic_shrinking = true, }; int handshake_req_hash_init(void) { return rhashtable_init(&handshake_rhashtbl, &handshake_rhash_params); } void handshake_req_hash_destroy(void) { rhashtable_destroy(&handshake_rhashtbl); } struct handshake_req *handshake_req_hash_lookup(struct sock *sk) { return rhashtable_lookup_fast(&handshake_rhashtbl, &sk, handshake_rhash_params); } EXPORT_SYMBOL_IF_KUNIT(handshake_req_hash_lookup); static bool handshake_req_hash_add(struct handshake_req *req) { int ret; ret = rhashtable_lookup_insert_fast(&handshake_rhashtbl, &req->hr_rhash, handshake_rhash_params); return ret == 0; } static void handshake_req_destroy(struct handshake_req *req) { if (req->hr_proto->hp_destroy) req->hr_proto->hp_destroy(req); rhashtable_remove_fast(&handshake_rhashtbl, &req->hr_rhash, handshake_rhash_params); kfree(req); } static void handshake_sk_destruct(struct sock *sk) { void (*sk_destruct)(struct sock *sk); struct handshake_req *req; req = handshake_req_hash_lookup(sk); if (!req) return; trace_handshake_destruct(sock_net(sk), req, sk); sk_destruct = req->hr_odestruct; handshake_req_destroy(req); if (sk_destruct) sk_destruct(sk); } /** * handshake_req_alloc - Allocate a handshake request * @proto: security protocol * @flags: memory allocation flags * * Returns an initialized handshake_req or NULL. */ struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto, gfp_t flags) { struct handshake_req *req; if (!proto) return NULL; if (proto->hp_handler_class <= HANDSHAKE_HANDLER_CLASS_NONE) return NULL; if (proto->hp_handler_class >= HANDSHAKE_HANDLER_CLASS_MAX) return NULL; if (!proto->hp_accept || !proto->hp_done) return NULL; req = kzalloc_flex(*req, hr_priv, proto->hp_privsize, flags); if (!req) return NULL; INIT_LIST_HEAD(&req->hr_list); req->hr_proto = proto; return req; } EXPORT_SYMBOL(handshake_req_alloc); /** * handshake_req_private - Get per-handshake private data * @req: handshake arguments * */ void *handshake_req_private(struct handshake_req *req) { return (void *)&req->hr_priv; } EXPORT_SYMBOL(handshake_req_private); static bool __add_pending_locked(struct handshake_net *hn, struct handshake_req *req) { if (WARN_ON_ONCE(!list_empty(&req->hr_list))) return false; hn->hn_pending++; list_add_tail(&req->hr_list, &hn->hn_requests); return true; } static void __remove_pending_locked(struct handshake_net *hn, struct handshake_req *req) { hn->hn_pending--; list_del_init(&req->hr_list); } /* * Returns %true if the request was found on @net's pending list, * otherwise %false. * * If @req was on a pending list, it has not yet been accepted. */ static bool remove_pending(struct handshake_net *hn, struct handshake_req *req) { bool ret = false; spin_lock(&hn->hn_lock); if (!list_empty(&req->hr_list)) { __remove_pending_locked(hn, req); ret = true; } spin_unlock(&hn->hn_lock); return ret; } struct handshake_req *handshake_req_next(struct handshake_net *hn, int class) { struct handshake_req *req, *pos; req = NULL; spin_lock(&hn->hn_lock); list_for_each_entry(pos, &hn->hn_requests, hr_list) { if (pos->hr_proto->hp_handler_class != class) continue; __remove_pending_locked(hn, pos); req = pos; break; } spin_unlock(&hn->hn_lock); return req; } EXPORT_SYMBOL_IF_KUNIT(handshake_req_next); /** * handshake_req_submit - Submit a handshake request * @sock: open socket on which to perform the handshake * @req: handshake arguments * @flags: memory allocation flags * * Return values: * %0: Request queued * %-EINVAL: Invalid argument * %-EBUSY: A handshake is already under way for this socket * %-ESRCH: No handshake agent is available * %-EAGAIN: Too many pending handshake requests * %-ENOMEM: Failed to allocate memory * %-EMSGSIZE: Failed to construct notification message * %-EOPNOTSUPP: Handshake module not initialized * * A zero return value from handshake_req_submit() means that * exactly one subsequent completion callback is guaranteed. * * A negative return value from handshake_req_submit() means that * no completion callback will be done and that @req has been * destroyed. */ int handshake_req_submit(struct socket *sock, struct handshake_req *req, gfp_t flags) { struct handshake_net *hn; struct net *net; int ret; if (!sock || !req || !sock->file) { kfree(req); return -EINVAL; } req->hr_sk = sock->sk; if (!req->hr_sk) { kfree(req); return -EINVAL; } req->hr_odestruct = req->hr_sk->sk_destruct; req->hr_sk->sk_destruct = handshake_sk_destruct; ret = -EOPNOTSUPP; net = sock_net(req->hr_sk); hn = handshake_pernet(net); if (!hn) goto out_err; ret = -EAGAIN; if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max) goto out_err; spin_lock(&hn->hn_lock); ret = -EOPNOTSUPP; if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags)) goto out_unlock; ret = -EBUSY; if (!handshake_req_hash_add(req)) goto out_unlock; if (!__add_pending_locked(hn, req)) goto out_unlock; spin_unlock(&hn->hn_lock); ret = handshake_genl_notify(net, req->hr_proto, flags); if (ret) { trace_handshake_notify_err(net, req, req->hr_sk, ret); if (remove_pending(hn, req)) goto out_err; } /* Prevent socket release while a handshake request is pending */ sock_hold(req->hr_sk); trace_handshake_submit(net, req, req->hr_sk); return 0; out_unlock: spin_unlock(&hn->hn_lock); out_err: /* Restore original destructor so socket teardown still runs on failure */ req->hr_sk->sk_destruct = req->hr_odestruct; trace_handshake_submit_err(net, req, req->hr_sk, ret); handshake_req_destroy(req); return ret; } EXPORT_SYMBOL(handshake_req_submit); void handshake_complete(struct handshake_req *req, unsigned int status, struct genl_info *info) { struct sock *sk = req->hr_sk; struct net *net = sock_net(sk); if (!test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { trace_handshake_complete(net, req, sk, status); req->hr_proto->hp_done(req, status, info); /* Handshake request is no longer pending */ sock_put(sk); } } EXPORT_SYMBOL_IF_KUNIT(handshake_complete); /** * handshake_req_cancel - Cancel an in-progress handshake * @sk: socket on which there is an ongoing handshake * * Request cancellation races with request completion. To determine * who won, callers examine the return value from this function. * * Return values: * %true - Uncompleted handshake request was canceled * %false - Handshake request already completed or not found */ bool handshake_req_cancel(struct sock *sk) { struct handshake_req *req; struct handshake_net *hn; struct net *net; net = sock_net(sk); req = handshake_req_hash_lookup(sk); if (!req) { trace_handshake_cancel_none(net, req, sk); return false; } hn = handshake_pernet(net); if (hn && remove_pending(hn, req)) { /* Request hadn't been accepted - mark cancelled */ if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { trace_handshake_cancel_busy(net, req, sk); return false; } goto out_true; } if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) { /* Request already completed */ trace_handshake_cancel_busy(net, req, sk); return false; } out_true: trace_handshake_cancel(net, req, sk); /* Handshake request is no longer pending */ sock_put(sk); return true; } EXPORT_SYMBOL(handshake_req_cancel);
106 106 59 36 1 3 34 4 66 53 40 86 50 67 30 26 6 85 37 23 85 54 75 106 99 2 2 4 2 6 172 126 125 55 112 4 6 30 85 79 30 30 30 4 2 82 72 13 1 72 16 20 2 2 2 20 139 57 23 20 17 70 82 28 1 39 55 26 93 88 56 10 56 20 26 26 18 26 15 33 59 59 36 23 25 22 22 100 28 104 103 17 83 17 15 24 44 104 62 66 78 118 123 2 1 76 106 3 71 41 4 159 1 5 89 2 27 4 6 25 3 49 49 2 5 45 45 45 13 45 1 23 4 30 44 21 11 32 23 13 4 43 42 2 13 2 12 12 5 172 41 43 9 9 3 10 12 42 15 4 42 34 33 10 42 42 7 42 134 110 111 170 159 126 97 126 59 121 42 26 26 26 21 24 3 25 4 25 3 25 9 24 25 27 2 2 43 10 3 1 31 14 15 26 26 13 12 10 4 4 5 24 9 8 41 43 30 43 4 43 26 43 43 43 1 43 22 43 42 40 20 42 42 42 26 2 4 49 49 42 32 49 1 21 49 49 5 7 46 13 45 49 14 21 21 31 43 45 45 45 41 23 43 5 41 27 27 26 11 42 41 42 49 48 48 45 49 49 49 49 49 48 49 7 7 7 7 7 176 33 5 28 11 45 144 44 38 5 45 33 37 37 1 37 37 35 2 37 57 8 8 37 23 23 23 2 23 22 16 14 3 15 16 4 4 41 38 11 38 37 39 38 1 39 4 33 33 33 33 33 4 4 8 8 1 1 4 8 4 22 22 51 8 53 13 57 57 57 11 56 30 49 9 1 1 28 29 46 42 22 42 42 42 8 5 8 8 8 55 1 55 55 39 33 171 182 6 105 39 170 171 172 97 199 127 154 74 155 162 6 20 180 4 3 170 163 128 21 14 42 32 143 44 7 164 164 164 161 42 162 117 71 22 163 161 34 164 46 150 126 128 115 40 28 118 182 42 164 7 163 2 111 38 43 145 183 183 182 130 128 126 128 20 12 46 25 23 137 97 159 157 94 38 185 143 178 179 151 178 132 81 59 34 40 64 180 11 8 20 12 12 23 28 5 5 5 5 34 34 81 157 13 3 2 4 4 4 4 5 179 8 183 45 182 160 125 2 3 2 123 183 1 127 165 185 56 169 23 165 112 180 174 48 180 180 45 183 182 183 182 181 181 180 12 182 56 23 111 162 110 170 182 183 183 183 32 32 31 1 32 30 2 1 1 1 19 150 151 152 130 2 41 72 41 72 71 99 41 72 49 16 7 26 1 1 132 10 105 119 7 99 14 173 7 3 5 3 46 43 44 35 9 27 3 14 44 41 8 28 6 2 39 5 27 8 21 31 4 5 30 1 56 2 47 38 1 8 47 45 17 9 8 8 18 18 4 15 13 3 9 10 13 32 6 31 28 18 18 13 2 20 13 24 11 11 5 91 21 93 76 72 72 2 23 73 72 17 26 25 34 2 36 35 32 17 26 12 138 138 22 19 43 11 27 16 16 31 13 14 5 11 12 31 14 32 20 43 43 44 142 118 79 143 67 129 129 14 1 13 2 1 1 13 1 130 129 157 160 88 145 144 120 118 1 1 30 94 92 31 93 38 25 25 113 27 120 103 60 5 116 40 41 1 9 9 44 19 17 18 19 10 20 11 18 9 2 17 17 18 18 18 18 18 18 18 18 20 12 13 17 14 17 12 17 14 3 11 11 11 11 22 2 20 1 19 18 11 14 1 58 59 56 59 59 60 57 35 71 172 172 133 127 1 63 133 86 67 30 44 43 23 111 124 46 45 1 46 44 1 1 171 153 46 151 38 33 173 171 8 160 173 14 7 20 11 147 41 41 4 127 128 99 37 111 12 39 38 16 17 17 34 10 23 7 26 25 14 21 114 114 114 1 111 111 111 84 84 84 84 84 43 12 31 41 2 31 12 11 11 11 4 7 4 7 29 1 72 62 38 54 27 30 1 31 31 31 30 20 11 31 4 2 2 4 23 39 6 1 40 41 130 131 7 94 27 83 83 9 80 1 41 1 40 41 41 1 40 41 40 2 39 33 6 3 3 3 2 28 7 74 2 28 8 6 6 65 71 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 // SPDX-License-Identifier: GPL-2.0 /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ /* * Changes: * Pedro Roque : Fast Retransmit/Recovery. * Two receive queues. * Retransmit queue handled by TCP. * Better retransmit timer handling. * New congestion avoidance. * Header prediction. * Variable renaming. * * Eric : Fast Retransmit. * Randy Scott : MSS option defines. * Eric Schenk : Fixes to slow start algorithm. * Eric Schenk : Yet another double ACK bug. * Eric Schenk : Delayed ACK bug fixes. * Eric Schenk : Floyd style fast retrans war avoidance. * David S. Miller : Don't allow zero congestion window. * Eric Schenk : Fix retransmitter so that it sends * next packet on ack of previous packet. * Andi Kleen : Moved open_request checking here * and process RSTs for open_requests. * Andi Kleen : Better prune_queue, and other fixes. * Andrey Savochkin: Fix RTT measurements in the presence of * timestamps. * Andrey Savochkin: Check sequence numbers correctly when * removing SACKs due to in sequence incoming * data segments. * Andi Kleen: Make sure we never ack data there is not * enough room for. Also make this condition * a fatal error if it might still happen. * Andi Kleen: Add tcp_measure_rcv_mss to make * connections with MSS<min(MTU,ann. MSS) * work without delayed acks. * Andi Kleen: Process packets with PSH set in the * fast path. * J Hadi Salim: ECN support * Andrei Gurtov, * Pasi Sarolahti, * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/mm.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> #include <linux/bitops.h> #include <net/dst.h> #include <net/tcp.h> #include <net/tcp_ecn.h> #include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> #include <linux/unaligned.h> #include <linux/errqueue.h> #include <trace/events/tcp.h> #include <linux/jump_label_ratelimit.h> #include <net/busy_poll.h> #include <net/mptcp.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ #define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */ #define FLAG_TS_PROGRESS 0x40000 /* Positive timestamp delta */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) #define REXMIT_NONE 0 /* no loss recovery to do */ #define REXMIT_LOST 1 /* retransmit packets marked lost */ #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ #if IS_ENABLED(CONFIG_TLS_DEVICE) static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)) { tp->tcp_clean_acked = cad; static_branch_deferred_inc(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_enable); void clean_acked_data_disable(struct tcp_sock *tp) { static_branch_slow_dec_deferred(&clean_acked_data_enabled); tp->tcp_clean_acked = NULL; } EXPORT_SYMBOL_GPL(clean_acked_data_disable); void clean_acked_data_flush(void) { static_key_deferred_flush(&clean_acked_data_enabled); } EXPORT_SYMBOL_GPL(clean_acked_data_flush); #endif #ifdef CONFIG_CGROUP_BPF static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) { bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown && BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG); bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG); struct bpf_sock_ops_kern sock_ops; if (likely(!unknown_opt && !parse_all_opt)) return; /* The skb will be handled in the * bpf_skops_established() or * bpf_skops_write_hdr_opt(). */ switch (sk->sk_state) { case TCP_SYN_RECV: case TCP_SYN_SENT: case TCP_LISTEN: return; } sock_owned_by_me(sk); memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { struct bpf_sock_ops_kern sock_ops; sock_owned_by_me(sk); memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ if (skb) bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); } #else static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) { } static void bpf_skops_established(struct sock *sk, int bpf_op, struct sk_buff *skb) { } #endif static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb, unsigned int len) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); if (!dev || len >= READ_ONCE(dev->mtu)) pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", dev ? dev->name : "Unknown driver"); rcu_read_unlock(); } /* Adapt the MSS value used to make delayed ack decision to the * real world. */ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; unsigned int len; icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { /* Note: divides are still a bit expensive. * For the moment, only adjust scaling_ratio * when we update icsk_ack.rcv_mss. */ if (unlikely(len != icsk->icsk_ack.rcv_mss)) { u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; u8 old_ratio = tcp_sk(sk)->scaling_ratio; do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; if (old_ratio != tcp_sk(sk)->scaling_ratio) { struct tcp_sock *tp = tcp_sk(sk); val = tcp_win_from_space(sk, sk->sk_rcvbuf); tcp_set_window_clamp(sk, val); if (tp->window_clamp < tp->rcvq_space.space) tp->rcvq_space.space = tp->window_clamp; } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE, tcp_gro_dev_warn, sk, skb, len); /* If the skb has a len of exactly 1*MSS and has the PSH bit * set then it is likely the end of an application write. So * more data may not be arriving soon, and yet the data sender * may be waiting for an ACK if cwnd-bound or using TX zero * copy. So we set ICSK_ACK_PUSHED here so that * tcp_cleanup_rbuf() will send an ACK immediately if the app * reads all of the data and is not ping-pong. If len > MSS * then this logic does not matter (and does not hurt) because * tcp_cleanup_rbuf() will always ACK immediately if the app * reads data and there is more than an MSS of unACKed data. */ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH) icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. * * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ len -= tcp_sk(sk)->tcp_header_len; icsk->icsk_ack.last_seg_size = len; if (len == lss) { icsk->icsk_ack.rcv_mss = len; return; } } if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2; icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } } static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks == 0) quickacks = 2; quickacks = min(quickacks, max_quickacks); if (quickacks > icsk->icsk_ack.quick) icsk->icsk_ack.quick = quickacks; } static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk, max_quickacks); inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ack.dst_quick_ack || (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tcp_ecn_disabled(tp)) return; switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, * and we already seen ECT on a previous segment, * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by * tcp_data_ecn_check() when the ECN codepoint of * received TCP data contains ECT(0), ECT(1), or CE. */ if (!tcp_ecn_mode_rfc3168(tp)) break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); if (!tcp_ecn_mode_rfc3168(tp)) break; tp->ecn_flags |= TCP_ECN_SEEN; break; } } /* Returns true if the byte counters can be used */ static bool tcp_accecn_process_option(struct tcp_sock *tp, const struct sk_buff *skb, u32 delivered_bytes, int flag) { u8 estimate_ecnfield = tp->est_ecnfield; bool ambiguous_ecn_bytes_incr = false; bool first_changed = false; unsigned int optlen; bool order1, res; unsigned int i; u8 *ptr; if (tcp_accecn_opt_fail_recv(tp)) return false; if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { if (!tp->saw_accecn_opt) { /* Too late to enable after this point due to * potential counter wraps */ if (tp->bytes_sent >= (1 << 23) - 1) { u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; tcp_accecn_saw_opt_fail_recv(tp, saw_opt); } return false; } if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1; tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; return true; } return false; } ptr = skb_transport_header(skb) + tp->rx_opt.accecn; optlen = ptr[1] - 2; if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) return false; order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2; if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { tp->saw_accecn_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); } res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; u8 ecnfield; s32 delta; u32 *cnt; if (optlen < TCPOLEN_ACCECN_PERFIELD) break; ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); init_offset = tcp_accecn_field_init_offset(ecnfield); cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); if (delta && delta < 0) { res = false; ambiguous_ecn_bytes_incr = true; } if (delta && ecnfield != estimate_ecnfield) { if (!first_changed) { tp->est_ecnfield = ecnfield; first_changed = true; } else { res = false; ambiguous_ecn_bytes_incr = true; } } optlen -= TCPOLEN_ACCECN_PERFIELD; ptr += TCPOLEN_ACCECN_PERFIELD; } if (ambiguous_ecn_bytes_incr) tp->est_ecnfield = 0; return res; } static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; } /* Updates the delivered and delivered_ce counts */ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } #define PKTS_ACKED_WEIGHT 6 #define PKTS_ACKED_PREC 6 #define ACK_COMP_THRESH 4 /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, int flag) { u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1]; const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); u32 delta, safe_delta, d_ceb; bool opt_deltas_valid; u32 corrected_ace; u32 ewma; /* Reordered ACK or uncertain due to lack of data to send and ts */ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; opt_deltas_valid = tcp_accecn_process_option(tp, skb, delivered_bytes, flag); if (delivered_pkts) { if (!tp->pkts_acked_ewma) { ewma = delivered_pkts << PKTS_ACKED_PREC; } else { ewma = tp->pkts_acked_ewma; ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) + (delivered_pkts << PKTS_ACKED_PREC)) >> PKTS_ACKED_WEIGHT; } tp->pkts_acked_ewma = min_t(u32, ewma, 0xFFFFU); } if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) return 0; } /* ACE field is not available during handshake */ if (flag & FLAG_SYN_ACKED) return 0; if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) return delta; safe_delta = delivered_pkts - ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); if (opt_deltas_valid) { d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb; if (!d_ceb) return delta; if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) && (tcp_is_sack(tp) || ((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))) { u32 est_d_cep; if (delivered_bytes <= d_ceb) return safe_delta; est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb * delivered_pkts, delivered_bytes); return min(safe_delta, delta + (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK)); } if (d_ceb > delta * tp->mss_cache) return safe_delta; if (d_ceb < safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) return delta; } else if (tp->pkts_acked_ewma > (ACK_COMP_THRESH << PKTS_ACKED_PREC)) return delta; return safe_delta; } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; delta = __tcp_accecn_process(sk, skb, delivered_pkts, delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; /* Recalculate header predictor */ if (tp->pred_flags) tcp_fast_path_on(tp); } return delta; } /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. */ static void tcp_sndbuf_expand(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; int sndmem, per_mss; u32 nr_segs; /* Worst case is non GSO/TSO : each frame consumes one skb * and skb->head is kmalloced using power of two area of memory */ per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); per_mss = roundup_pow_of_two(per_mss) + SKB_DATA_ALIGN(sizeof(struct sk_buff)); nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp)); nr_segs = max_t(u32, nr_segs, tp->reordering + 1); /* Fast Recovery (RFC 5681 3.2) : * Cubic needs 1.7 factor, rounded to 2 to include * extra cushion (application might react slowly to EPOLLOUT) */ sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; sndmem *= nr_segs * per_mss; if (sk->sk_sndbuf < sndmem) WRITE_ONCE(sk->sk_sndbuf, min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) * * All tcp_full_space() is split to two parts: "network" buffer, allocated * forward and advertised in receiver window (tp->rcv_wnd) and * "application buffer", required to isolate scheduling/application * latencies from network. * window_clamp is maximal advertised window. It can be less than * tcp_full_space(), in this case tcp_full_space() - window_clamp * is reserved for "application" buffer. The less window_clamp is * the smoother our behaviour from viewpoint of network, but the lower * throughput and the higher sensitivity of the connection to losses. 8) * * rcv_ssthresh is more strict window_clamp used at "slow start" * phase to predict further behaviour of this connection. * It is used for two goals: * - to enforce header prediction at sender, even when application * requires some significant "application buffer". It is check #1. * - to prevent pruning of receive queue because of misprediction * of receiver window. Check #2. * * The scheme does not work when sender sends good segments opening * window and then starts to feed us spaghetti. But it should work * in common situations. Otherwise, we have to rely on queue collapsing. */ /* Slow part of check#2. */ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, unsigned int skbtruesize) { const struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) return 2 * inet_csk(sk)->icsk_ack.rcv_mss; truesize >>= 1; window >>= 1; } return 0; } /* Even if skb appears to have a bad len/truesize ratio, TCP coalescing * can play nice with us, as sk_buff and skb->head might be either * freed or shared with up to MAX_SKB_FRAGS segments. * Only give a boost to drivers using page frag(s) to hold the frame(s), * and if no payload was pulled in skb->head before reaching us. */ static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) { u32 truesize = skb->truesize; if (adjust && !skb_headlen(skb)) { truesize -= SKB_TRUESIZE(skb_end_offset(skb)); /* paranoid check, some drivers might be buggy */ if (unlikely((int)truesize < (int)skb->len)) truesize = skb->truesize; } return truesize; } static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, bool adjust) { struct tcp_sock *tp = tcp_sk(sk); int room; room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; if (room <= 0) return; /* Check #1 */ if (!tcp_under_memory_pressure(sk)) { unsigned int truesize = truesize_adjust(adjust, skb); int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ if (tcp_win_from_space(sk, truesize) <= skb->len) incr = 2 * tp->advmss; else incr = __tcp_grow_window(sk, skb, truesize); if (incr) { incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } } else { /* Under pressure: * Adjust rcv_ssthresh according to reserved mem */ tcp_adjust_rcv_ssthresh(sk); } } /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ static void tcp_init_buffer_space(struct sock *sk) { int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win); struct tcp_sock *tp = tcp_sk(sk); int maxwin; if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); if (tp->window_clamp >= maxwin) { WRITE_ONCE(tp->window_clamp, maxwin); if (tcp_app_win && maxwin > 4 * tp->advmss) WRITE_ONCE(tp->window_clamp, max(maxwin - (maxwin >> tcp_app_win), 4 * tp->advmss)); } /* Force reservation of one segment. */ if (tcp_app_win && tp->window_clamp > 2 * tp->advmss && tp->window_clamp + tp->advmss > maxwin) WRITE_ONCE(tp->window_clamp, max(2 * tp->advmss, maxwin - tp->advmss)); tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd, (u32)TCP_INIT_CWND * tp->advmss); } /* 4. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int rmem2; icsk->icsk_ack.quick = 0; rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); if (sk->sk_rcvbuf < rmem2 && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { WRITE_ONCE(sk->sk_rcvbuf, min(atomic_read(&sk->sk_rmem_alloc), rmem2)); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); } /* Initialize RCV_MSS value. * RCV_MSS is an our guess about MSS used by the peer. * We haven't any direct information about the MSS. * It's better to underestimate the RCV_MSS rather than overestimate. * Overestimations make us ACKing less frequently than needed. * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). */ void tcp_initialize_rcv_mss(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); hint = min(hint, TCP_MSS_DEFAULT); hint = max(hint, TCP_MIN_MSS); inet_csk(sk)->icsk_ack.rcv_mss = hint; } EXPORT_IPV6_MOD(tcp_initialize_rcv_mss); /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. * <https://public.lanl.gov/radiant/pubs.html#DRS> * * More detail on this code can be found at * <http://staff.psc.edu/jheffner/>, * though this reference is out of date. A new paper * is pending. */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us; long m = sample << 3; if (old_sample == 0 || m < old_sample) { new_sample = m; } else { /* If we sample in larger samples in the non-timestamp * case, we could grossly overestimate the RTT especially * with chatty applications or bulk transfer apps which * are stalled on filesystem I/O. * * Also, since we are only going for a minimum in the * non-timestamp case, we do not smooth things out * else with timestamps disabled convergence takes too * long. */ if (win_dep) return; /* Do not use this sample if receive queue is not empty. */ if (tp->rcv_nxt != tp->copied_seq) return; new_sample = old_sample - (old_sample >> 3) + sample; } tp->rcv_rtt_est.rtt_us = new_sample; } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { u32 delta_us; if (tp->rcv_rtt_est.time == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); if (!delta_us) delta_us = 1; tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; tp->rcv_rtt_est.time = tp->tcp_mstamp; } static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta) { u32 delta, delta_us; delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr; if (tp->tcp_usec_ts) return delta; if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { if (!delta) delta = min_delta; delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); return delta_us; } return -1; } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) return; tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; if (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { s32 delta = tcp_rtt_tsopt_us(tp, 0); if (delta > 0) tcp_rcv_rtt_update(tp, delta, 0); } } void tcp_rcvbuf_grow(struct sock *sk, u32 newval) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); u32 rcvwin, rcvbuf, cap, oldval; u32 rtt_threshold, rtt_us; u64 grow; oldval = tp->rcvq_space.space; tp->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return; /* DRS is always one RTT late. */ rcvwin = newval << 1; rtt_us = tp->rcv_rtt_est.rtt_us >> 3; rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt); if (rtt_us < rtt_threshold) { /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold. * It might take few additional ms to reach 'line rate', * but will avoid sk_rcvbuf inflation and poor cache use. */ grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold); } else { /* slow start: allow the sender to double its rate. */ grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval); } rcvwin += grow; if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make the window clamp follow along. */ WRITE_ONCE(tp->window_clamp, tcp_win_from_space(sk, rcvbuf)); } } /* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. */ void tcp_rcv_space_adjust(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int time, inq, copied; trace_tcp_rcv_space_adjust(sk); if (unlikely(!tp->rcv_rtt_est.rtt_us)) return; /* We do not refresh tp->tcp_mstamp here. * Some platforms have expensive ktime_get() implementations. * Using the last cached value is enough for DRS. */ time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); if (time < (tp->rcv_rtt_est.rtt_us >> 3)) return; /* Number of bytes copied to user in last RTT */ copied = tp->copied_seq - tp->rcvq_space.seq; /* Number of bytes in receive queue. */ inq = tp->rcv_nxt - tp->copied_seq; copied -= inq; if (copied <= tp->rcvq_space.space) goto new_measure; trace_tcp_rcvbuf_grow(sk, time); tcp_rcvbuf_grow(sk, copied); new_measure: tp->rcvq_space.seq = tp->copied_seq; tp->rcvq_space.time = tp->tcp_mstamp; } static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct inet_connection_sock *icsk = inet_csk(sk); if (skb->protocol == htons(ETH_P_IPV6)) icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb))); #endif } /* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval. When a * connection starts up, we want to ack as quickly as possible. The * problem is that "good" TCP's do slow start at the beginning of data * transmission. The means that until we send the first few ACK's the * sender will sit on his end and only queue most of his data, because * he can only send snd_cwnd unacked packets at any given time. For * each ACK we send, he increments snd_cwnd and transmits more of his * queue. -DaveM */ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; inet_csk_schedule_ack(sk); tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_jiffies32; if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * delayed ACK engine. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); icsk->icsk_ack.ato = TCP_ATO_MIN; } else { int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN / 2) { /* The fastest case is the first. */ icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; } else if (m < icsk->icsk_ack.ato) { icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; if (icsk->icsk_ack.ato > icsk->icsk_rto) icsk->icsk_ack.ato = icsk->icsk_rto; } else if (m > icsk->icsk_rto) { /* Too long gap. Apparently sender failed to * restart window, so that we send ACKs quickly. */ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); } } icsk->icsk_ack.lrcvtime = now; tcp_save_lrcv_flowlabel(sk, skb); tcp_data_ecn_check(sk, skb); if (skb->len >= 128) tcp_grow_window(sk, skb, true); } /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); long m = mrtt_us; /* RTT */ u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. * This is designed to be as fast as possible * m stands for "measurement". * * On a 1990 paper the rto value is changed to: * RTO = rtt + 4 * mdev * * Funny. This algorithm seems to be very broken. * These formulae increase RTO, when it should be decreased, increase * too slowly, when it should be increased quickly, decrease too quickly * etc. I guess in BSD RTO takes ONE value, so that it is absolutely * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ if (srtt != 0) { m -= (srtt >> 3); /* m is now error in rtt est */ srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain * for mdev in this case (alpha*beta). * Like Eifel it also prevents growth of rto, * but also it limits too fast rto decreases, * happening in pure Eifel. */ if (m > 0) m >>= 3; } else { m -= (tp->mdev_us >> 2); /* similar update on mdev */ } tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ if (tp->mdev_us > tp->mdev_max_us) { tp->mdev_max_us = tp->mdev_us; if (tp->mdev_max_us > tp->rttvar_us) tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { if (tp->mdev_max_us < tp->rttvar_us) tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; tp->mdev_max_us = tcp_rto_min_us(sk); tcp_bpf_rtt(sk, mrtt_us, srtt); } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; tcp_bpf_rtt(sk, mrtt_us, srtt); } tp->srtt_us = max(1U, srtt); } void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); /* current rate is (cwnd * mss) / srtt * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. * In Congestion Avoidance phase, set it to 120 % the current rate. * * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching * end of slow start and should slow down. */ if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio); else rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio); rate *= max(tcp_snd_cwnd(tp), tp->packets_out); if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store * intermediate values in this location. */ WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate))); } /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) * * More seriously: * 1. If rtt variance happened to be less 50msec, it is hallucination. * It cannot be less due to utterly erratic ACK generation made * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exactly, which we pretend to do. */ /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ tcp_bound_rto(sk); } __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) cwnd = TCP_INIT_CWND; return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } struct tcp_sacktag_state { /* Timestamps for earliest and latest never-retransmitted segment * that was SACKed. RTO needs the earliest RTT to stay conservative, * but congestion control should still get an accurate delay signal. */ u64 first_sackt; u64 last_sackt; u32 reord; u32 sack_delivered; u32 delivered_bytes; int flag; unsigned int mss_now; struct rate_sample *rate; }; /* Take a notice that peer is sending D-SACKs. Skip update of data delivery * and spurious retransmission information if this DSACK is unlikely caused by * sender's action: * - DSACKed sequence range is larger than maximum receiver's window. * - Total no. of DSACKed segments exceed the total no. of retransmitted segs. */ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, u32 end_seq, struct tcp_sacktag_state *state) { u32 seq_len, dup_segs = 1; if (!before(start_seq, end_seq)) return 0; seq_len = end_seq - start_seq; /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */ if (seq_len > tp->max_window) return 0; if (seq_len > tp->mss_cache) dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq) state->flag |= FLAG_DSACK_TLP; tp->dsack_dups += dup_segs; /* Skip the DSACK if dup segs weren't retransmitted by sender */ if (tp->dsack_dups > tp->total_retrans) return 0; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; /* We increase the RACK ordering window in rounds where we receive * DSACKs that may have been due to reordering causing RACK to trigger * a spurious fast recovery. Thus RACK ignores DSACKs that happen * without having seen reordering, or that match TLP probes (TLP * is timer-driven, not triggered by RACK). */ if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP)) tp->rack.dsack_seen = 1; state->flag |= FLAG_DSACKING_ACK; /* A spurious retransmission is delivered */ state->sack_delivered += dup_segs; return dup_segs; } /* It's reordering when higher sequence was delivered (i.e. sacked) before * some lower never-retransmitted sequence ("low_seq"). The maximum reordering * distance is approximated in full-mss packet distance ("reordering"). */ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, const int ts) { struct tcp_sock *tp = tcp_sk(sk); const u32 mss = tp->mss_cache; u32 fack, metric; fack = tcp_highest_sack_seq(tp); if (!before(low_seq, fack)) return; metric = fack - low_seq; if ((metric > tp->reordering * mss) && mss) { #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, 0, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif tp->reordering = min_t(u32, (metric + mss - 1) / mss, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering)); } /* This exciting event is worth to be remembered. 8) */ tp->reord_seen++; NET_INC_STATS(sock_net(sk), ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); } /* This must be called before lost_out or retrans_out are updated * on a new loss, because we want to know if all skbs previously * known to be lost have already been retransmitted, indicating * that this newly lost skb is our next skb to retransmit. */ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) { if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) || (tp->retransmit_skb_hint && before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq))) tp->retransmit_skb_hint = skb; } /* Sum the number of packets on the wire we have marked as lost, and * notify the congestion control module that the given skb was marked lost. */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { tp->lost += tcp_skb_pcount(skb); } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) { __u8 sacked = TCP_SKB_CB(skb)->sacked; struct tcp_sock *tp = tcp_sk(sk); if (sacked & TCPCB_SACKED_ACKED) return; tcp_verify_retransmit_hint(tp, skb); if (sacked & TCPCB_LOST) { if (sacked & TCPCB_SACKED_RETRANS) { /* Account for retransmits that are lost again */ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT, tcp_skb_pcount(skb)); tcp_notify_skb_loss_event(tp, skb); } } else { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tcp_notify_skb_loss_event(tp, skb); } } /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). * Packets in queue with these bits set are counted in variables * sacked_out, retrans_out and lost_out, correspondingly. * * Valid combinations are: * Tag InFlight Description * 0 1 - orig segment is in flight. * S 0 - nothing flies, orig reached receiver. * L 0 - nothing flies, orig lost by net. * R 2 - both orig and retransmit are in flight. * L|R 1 - orig is lost, retransmit is in flight. * S|R 1 - orig reached receiver, retrans is still in flight. * (L|S|R is logically valid, it could occur when L|R is sacked, * but it is equivalent to plain S and code short-circuits it to S. * L|S is logically invalid, it would mean -1 packet in flight 8)) * * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * * It is pleasant to note, that state diagram turns out to be commutative, * so that we are allowed not to be bothered by order of our actions, * when multiple events arrive simultaneously. (see the function below). * * Reordering detection. * -------------------- * Reordering metric is maximal distance, which a packet can be displaced * in packet stream. With SACKs we can estimate it: * * 1. SACK fills old hole and the corresponding segment was not * ever retransmitted -> reordering. Alas, we cannot use it * when segment was retransmitted. * 2. The last flaw is solved with D-SACK. D-SACK arrives * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. * * SACK block validation. * ---------------------- * * SACK block range validation checks that the received SACK block fits to * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. * Note that SND.UNA is not included to the range though being valid because * it means that the receiver is rather inconsistent with itself reporting * SACK reneging when it should advance SND.UNA. Such SACK block this is * perfectly valid, however, in light of RFC2018 which explicitly states * that "SACK block MUST reflect the newest segment. Even if the newest * segment is going to be discarded ...", not that it looks very clever * in case of head skb. Due to potentional receiver driven attacks, we * choose to avoid immediate execution of a walk in write queue due to * reneging and defer head skb's loss recovery to standard loss recovery * procedure that will eventually trigger (nothing forbids us doing this). * * Implements also blockage to start_seq wrap-around. Problem lies in the * fact that though start_seq (s) is before end_seq (i.e., not reversed), * there's no guarantee that it will be before snd_nxt (n). The problem * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt * wrap (s_w): * * <- outs wnd -> <- wrapzone -> * u e n u_w e_w s n_w * | | | | | | | * |<------------+------+----- TCP seqno space --------------+---------->| * ...-- <2^31 ->| |<--------... * ...---- >2^31 ------>| |<--------... * * Current code wouldn't be vulnerable but it's better still to discard such * crazy SACK blocks. Doing this check for start_seq alone closes somewhat * similar case (end_seq after snd_nxt wrap) as earlier reversed check in * snd_nxt wrap -> snd_una region will then become "well defined", i.e., * equal to the ideal case (infinite seqno space without wrap caused issues). * * With D-SACK the lower bound is extended to cover sequence space below * SND.UNA down to undo_marker, which is the last point of interest. Yet * again, D-SACK block must not to go across snd_una (for the same reason as * for the normal SACK blocks, explained above). But there all simplicity * ends, TCP might receive valid D-SACKs below that. As long as they reside * fully below undo_marker they do not affect behavior in anyway and can * therefore be safely ignored. In rare cases (which are more or less * theoretical ones), the D-SACK will nicely cross that boundary due to skb * fragmentation and packet reordering past skb's retransmission. To consider * them correctly, the acceptable range must be extended even more though * the exact amount is rather hard to quantify. However, tp->max_window can * be used as an exaggerated estimate. */ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, u32 start_seq, u32 end_seq) { /* Too far in future, or reversed (interpretation is ambiguous) */ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) return false; /* Nasty start_seq wrap-around check (see comments above) */ if (!before(start_seq, tp->snd_nxt)) return false; /* In outstanding window? ...This is valid exit for D-SACKs too. * start_seq == snd_una is non-sensical (see comments above) */ if (after(start_seq, tp->snd_una)) return true; if (!is_dsack || !tp->undo_marker) return false; /* ...Then it's D-SACK, and must reside below snd_una completely */ if (after(end_seq, tp->snd_una)) return false; if (!before(start_seq, tp->undo_marker)) return true; /* Too old */ if (!after(end_seq, tp->undo_marker)) return false; /* Undo_marker boundary crossing (overestimates a lot). Known already: * start_seq < undo_marker and end_seq >= undo_marker. */ return !before(start_seq, end_seq - tp->max_window); } static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); u32 dup_segs; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1)) return false; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); } else { return false; } dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state); if (!dup_segs) { /* Skip dubious DSACK */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS); return false; } NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs); /* D-SACK for already forgotten data... Do dumb counting. */ if (tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs); return true; } /* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment * which may fail and creates some hassle (caller must handle error case * returns). * * FIXME: this could be merged to shift decision code */ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq) { int err; bool in_sack; unsigned int pkt_len; unsigned int mss; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (tcp_skb_pcount(skb) > 1 && !in_sack && after(TCP_SKB_CB(skb)->end_seq, start_seq)) { mss = tcp_skb_mss(skb); in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); if (!in_sack) { pkt_len = start_seq - TCP_SKB_CB(skb)->seq; if (pkt_len < mss) pkt_len = mss; } else { pkt_len = end_seq - TCP_SKB_CB(skb)->seq; if (pkt_len < mss) return -EINVAL; } /* Round if necessary so that SACKs cover only full MSSes * and/or the remaining small portion (if present) */ if (pkt_len > mss) { unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) new_len += mss; pkt_len = new_len; } if (pkt_len >= skb->len && !in_sack) return 0; err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } return in_sack; } /* Record the most recently (re)sent time among the (s)acked packets * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from * draft-cheng-tcpm-rack-00.txt */ static void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time) { u32 rtt_us; rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior * retransmission) was sacked. * * If the original is lost, there is no ambiguity. Otherwise * we assume the original can be delayed up to aRTT + min_rtt. * the aRTT term is bounded by the fast recovery or timeout, * so it's at least one RTT (i.e., retransmission is at least * an RTT later). */ return; } tp->rack.advanced = 1; tp->rack.rtt_us = rtt_us; if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, end_seq, tp->rack.end_seq)) { tp->rack.mstamp = xmit_time; tp->rack.end_seq = end_seq; } } /* Mark the given newly-SACKed range as such, adjusting counters and hints. */ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, int dup_sack, int pcount, u32 plen, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); if ((sacked & TCPCB_SACKED_ACKED) && before(start_seq, state->reord)) state->reord = start_seq; } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ if (!after(end_seq, tp->snd_una)) return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { tcp_rack_advance(tp, sacked, end_seq, xmit_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, * we do not clear RETRANS, believing * that retransmission is still in flight. */ if (sacked & TCPCB_LOST) { sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); tp->lost_out -= pcount; tp->retrans_out -= pcount; } } else { if (!(sacked & TCPCB_RETRANS)) { /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ if (before(start_seq, tcp_highest_sack_seq(tp)) && before(start_seq, state->reord)) state->reord = start_seq; if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; if (state->first_sackt == 0) state->first_sackt = xmit_time; state->last_sackt = xmit_time; } if (sacked & TCPCB_LOST) { sacked &= ~TCPCB_LOST; tp->lost_out -= pcount; } } sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; state->delivered_bytes += plen; } /* D-SACK. We can detect redundant retransmission in S|R and plain R * frames and clear it. undo_retrans is decreased above, L|R frames * are accounted above as well. */ if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) { sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= pcount; } return sacked; } /* The bandwidth estimator estimates the rate at which the network * can currently deliver outbound data packets for this flow. At a high * level, it operates by taking a delivery rate sample for each ACK. * * A rate sample records the rate at which the network delivered packets * for this flow, calculated over the time interval between the transmission * of a data packet and the acknowledgment of that packet. * * Specifically, over the interval between each transmit and corresponding ACK, * the estimator generates a delivery rate sample. Typically it uses the rate * at which packets were acknowledged. However, the approach of using only the * acknowledgment rate faces a challenge under the prevalent ACK decimation or * compression: packets can temporarily appear to be delivered much quicker * than the bottleneck rate. Since it is physically impossible to do that in a * sustained fashion, when the estimator notices that the ACK rate is faster * than the transmit rate, it uses the latter: * * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) * bw = min(send_rate, ack_rate) * * Notice the estimator essentially estimates the goodput, not always the * network bottleneck link rate when the sending or receiving is limited by * other factors like applications or receiver window limits. The estimator * deliberately avoids using the inter-packet spacing approach because that * approach requires a large number of samples and sophisticated filtering. * * TCP flows can often be application-limited in request/response workloads. * The estimator marks a bandwidth sample as application-limited if there * was some moment during the sampled window of packets when there was no data * ready to send in the write queue. */ /* Update the connection delivery information and generate a rate sample. */ static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; /* Clear app limited if bubble is acked and gone. */ if (tp->app_limited && after(tp->delivered, tp->app_limited)) tp->app_limited = 0; /* TODO: there are multiple places throughout tcp_ack() to get * current time. Refactor the code using a new "tcp_acktag_state" * to carry current time, flags, stats like "tcp_sacktag_state". */ if (delivered) tp->delivered_mstamp = tp->tcp_mstamp; rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ /* Return an invalid sample if no timing information is available or * in recovery from loss with SACK reneging. Rate samples taken during * a SACK reneging event may overestimate bw by including packets that * were SACKed before the reneg. */ if (!rs->prior_mstamp || is_sack_reneg) { rs->delivered = -1; rs->interval_us = -1; return; } rs->delivered = tp->delivered - rs->prior_delivered; rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; /* delivered_ce occupies less than 32 bits in the skb control block */ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; /* Model sending data and receiving ACKs as separate pipeline phases * for a window. Usually the ACK phase is longer, but with ACK * compression the send phase can be longer. To be safe we use the * longer phase. */ snd_us = rs->interval_us; /* send phase */ ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); /* Record both segment send and ack receive intervals */ rs->snd_interval_us = snd_us; rs->rcv_interval_us = ack_us; /* Normally we expect interval_us >= min-rtt. * Note that rate may still be over-estimated when a spuriously * retransmistted skb was first (s)acked because "interval_us" * is under-estimated (up to an RTT). However continuously * measuring the delivery rate during loss recovery is crucial * for connections suffer heavy or prolonged losses. */ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { if (!rs->is_retrans) pr_debug("tcp rate: %ld %d %u %u %u\n", rs->interval_us, rs->delivered, inet_csk(sk)->icsk_ca_state, tp->rx_opt.sack_ok, tcp_min_rtt(tp)); rs->interval_us = -1; return; } /* Record the last non-app-limited or the highest app-limited bw */ if (!rs->is_app_limited || ((u64)rs->delivered * tp->rate_interval_us >= (u64)tp->rate_delivered * rs->interval_us)) { tp->rate_delivered = rs->delivered; tp->rate_interval_us = rs->interval_us; tp->rate_app_limited = rs->is_app_limited; } } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) * delivery information when the skb was last transmitted. * * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is * called multiple times. We favor the information from the most recently * sent skb, i.e., the skb with the most recently sent time and the highest * sequence. */ static void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); struct tcp_sock *tp = tcp_sk(sk); u64 tx_tstamp; if (!scb->tx.delivered_mstamp) return; tx_tstamp = tcp_skb_timestamp_us(skb); if (!rs->prior_delivered || tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, scb->end_seq, rs->last_end_seq)) { rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; rs->last_end_seq = scb->end_seq; /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tx_tstamp; /* Find the duration of the "send phase" of this window: */ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, scb->tx.first_tx_mstamp); } /* Mark off the skb delivered once it's sacked to avoid being * used again when it's cumulatively acked. For acked packets * we don't need to reset since it'll be freed soon. */ if (scb->sacked & TCPCB_SACKED_ACKED) scb->tx.delivered_mstamp = 0; } /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ BUG_ON(!pcount); /* Adjust counters and hints for the newly sacked sequence * range but discard the return value since prev is already * marked. We must tag the range first because the seq * advancement below implicitly advances * tcp_highest_sack_seq() when skb is highest_sack. */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; tcp_skb_pcount_add(prev, pcount); WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep * setting gso_size to something. */ if (!TCP_SKB_CB(prev)->tcp_gso_size) TCP_SKB_CB(prev)->tcp_gso_size = mss; /* CHECKME: To clear or not to clear? Mimics normal skb currently */ if (tcp_skb_pcount(skb) <= 1) TCP_SKB_CB(skb)->tcp_gso_size = 0; /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED); return false; } /* Whole SKB was eaten :-) */ if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) TCP_SKB_CB(prev)->end_seq++; if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); tcp_skb_collapse_tstamp(prev, skb); if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); return true; } /* I wish gso_size would have a bit more sane initialization than * something-or-zero which complicates things */ static int tcp_skb_seglen(const struct sk_buff *skb) { return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb); } /* Shifting pages past head area doesn't work */ static int skb_can_shift(const struct sk_buff *skb) { return !skb_headlen(skb) && skb_is_nonlinear(skb); } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen) { /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need * to make sure not storing more than 65535 * 8 bytes per skb, * even if current MSS is bigger. */ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) return 0; if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) return 0; return skb_shift(to, from, shiftlen); } /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; int mss; int pcount = 0; int len; int in_sack; /* Normally R but no L won't result in plain S */ if (!dup_sack && (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) goto fallback; if (!skb_can_shift(skb)) goto fallback; /* This frame is about to be dropped (was ACKed). */ if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) goto fallback; /* Can only happen with delayed DSACK + discard craziness */ prev = skb_rb_prev(skb); if (!prev) goto fallback; if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; if (!tcp_skb_can_collapse(prev, skb)) goto fallback; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (in_sack) { len = skb->len; pcount = tcp_skb_pcount(skb); mss = tcp_skb_seglen(skb); /* TODO: Fix DSACKs to not fragment already SACKed and we can * drop this restriction as unnecessary */ if (mss != tcp_skb_seglen(prev)) goto fallback; } else { if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) goto noop; /* CHECKME: This is non-MSS split case only?, this will * cause skipped skbs due to advancing loop btw, original * has that feature too */ if (tcp_skb_pcount(skb) <= 1) goto noop; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); if (!in_sack) { /* TODO: head merge to next could be attempted here * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), * though it might not be worth of the additional hassle * * ...we can probably just fallback to what was done * previously. We could try merging non-SACKed ones * as well but it probably isn't going to buy off * because later SACKs might again split them, and * it would make skb timestamp tracking considerably * harder problem. */ goto fallback; } len = end_seq - TCP_SKB_CB(skb)->seq; BUG_ON(len < 0); BUG_ON(len > skb->len); /* MSS boundaries should be honoured or else pcount will * severely break even though it makes things bit trickier. * Optimize common case to avoid most of the divides */ mss = tcp_skb_mss(skb); /* TODO: Fix DSACKs to not fragment already SACKed and we can * drop this restriction as unnecessary */ if (mss != tcp_skb_seglen(prev)) goto fallback; if (len == mss) { pcount = 1; } else if (len < mss) { goto noop; } else { pcount = len / mss; len = pcount * mss; } } /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) goto fallback; if (!tcp_skb_shift(prev, skb, pcount, len)) goto fallback; if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ skb = skb_rb_next(prev); if (!skb) goto out; if (!skb_can_shift(skb) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; if (!tcp_skb_can_collapse(prev, skb)) goto out; len = skb->len; pcount = tcp_skb_pcount(skb); if (tcp_skb_shift(prev, skb, pcount, len)) tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, 0); out: return prev; noop: return skb; fallback: NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); return NULL; } static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 start_seq, u32 end_seq, bool dup_sack_in) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; if (next_dup && before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { in_sack = tcp_match_skb_to_sack(sk, skb, next_dup->start_seq, next_dup->end_seq); if (in_sack > 0) dup_sack = true; } /* skb reference here is a bit tricky to get right, since * shifting can eat and free both this skb and the next, * so not even _safe variant of the loop is enough. */ if (in_sack <= 0) { tmp = tcp_shift_skb_data(sk, skb, state, start_seq, end_seq, dup_sack); if (tmp) { if (tmp != skb) { skb = tmp; continue; } in_sack = 0; } else { in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq); } } if (unlikely(in_sack < 0)) break; if (in_sack) { TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) list_del_init(&skb->tcp_tsorted_anchor); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) tcp_advance_highest_sack(sk, skb); } } return skb; } static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) { struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; struct sk_buff *skb; while (*p) { parent = *p; skb = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb)->seq)) { p = &parent->rb_left; continue; } if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { p = &parent->rb_right; continue; } return skb; } return NULL; } static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, u32 skip_to_seq) { if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) return skb; return tcp_sacktag_bsearch(sk, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, struct sock *sk, struct tcp_sack_block *next_dup, struct tcp_sacktag_state *state, u32 skip_to_seq) { if (!next_dup) return skb; if (before(next_dup->start_seq, skip_to_seq)) { skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); skb = tcp_sacktag_walk(skb, sk, NULL, state, next_dup->start_seq, next_dup->end_seq, 1); } return skb; } static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) { return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_snd_una, struct tcp_sacktag_state *state) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); struct tcp_sack_block sp[TCP_NUM_SACKS]; struct tcp_sack_block *cache; struct sk_buff *skb; int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); int used_sacks; bool found_dup_sack = false; int i, j; int first_sack_index; state->flag = 0; state->reord = tp->snd_nxt; if (!tp->sacked_out) tcp_highest_sack_reset(sk); found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una, state); /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can * contain valid SACK info. */ if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window)) return 0; if (!tp->packets_out) goto out; used_sacks = 0; first_sack_index = 0; for (i = 0; i < num_sacks; i++) { bool dup_sack = !i && found_dup_sack; sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); if (!tcp_is_sackblock_valid(tp, dup_sack, sp[used_sacks].start_seq, sp[used_sacks].end_seq)) { int mib_idx; if (dup_sack) { if (!tp->undo_marker) mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; else mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && !after(sp[used_sacks].end_seq, tp->snd_una)) continue; mib_idx = LINUX_MIB_TCPSACKDISCARD; } NET_INC_STATS(sock_net(sk), mib_idx); if (i == 0) first_sack_index = -1; continue; } /* Ignore very old stuff early */ if (!after(sp[used_sacks].end_seq, prior_snd_una)) { if (i == 0) first_sack_index = -1; continue; } used_sacks++; } /* order SACK blocks to allow in order walk of the retrans queue */ for (i = used_sacks - 1; i > 0; i--) { for (j = 0; j < i; j++) { if (after(sp[j].start_seq, sp[j + 1].start_seq)) { swap(sp[j], sp[j + 1]); /* Track where the first SACK block goes to */ if (j == first_sack_index) first_sack_index = j + 1; } } } state->mss_now = tcp_current_mss(sk); skb = NULL; i = 0; if (!tp->sacked_out) { /* It's already past, so skip checking against it */ cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); } else { cache = tp->recv_sack_cache; /* Skip empty blocks in at head of the cache */ while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && !cache->end_seq) cache++; } while (i < used_sacks) { u32 start_seq = sp[i].start_seq; u32 end_seq = sp[i].end_seq; bool dup_sack = (found_dup_sack && (i == first_sack_index)); struct tcp_sack_block *next_dup = NULL; if (found_dup_sack && ((i + 1) == first_sack_index)) next_dup = &sp[i + 1]; /* Skip too early cached blocks */ while (tcp_sack_cache_ok(tp, cache) && !before(start_seq, cache->end_seq)) cache++; /* Can skip some work by looking recv_sack_cache? */ if (tcp_sack_cache_ok(tp, cache) && !dup_sack && after(end_seq, cache->start_seq)) { /* Head todo? */ if (before(start_seq, cache->start_seq)) { skb = tcp_sacktag_skip(skb, sk, start_seq); skb = tcp_sacktag_walk(skb, sk, next_dup, state, start_seq, cache->start_seq, dup_sack); } /* Rest of the block already fully processed? */ if (!after(end_seq, cache-&